From 5164002e4d2da07c6143c91ba627d7523526cdef Mon Sep 17 00:00:00 2001
From: "kaf24@firebug.cl.cam.ac.uk" <kaf24@firebug.cl.cam.ac.uk>
Date: Wed, 25 May 2005 10:36:59 +0000
Subject: [PATCH] bitkeeper revision 1.1548 (4294554btfa2GpomqV57KFpxEHsjEA)

Move to Linux's cpumask_t and 'hotplug' multi-processor booting
interfaces. This also brings apic.c and various other files closer to
their Linux 2.6 equivalents. Also simplify the scheduler interfaces a
little (particularly per-cpu and idle-domain initialisation).
Signed-off-by: Keir Fraser <keir@xensource.com>
---
 xen/arch/ia64/domain.c          |    1 -
 xen/arch/ia64/xensetup.c        |    3 -
 xen/arch/x86/acpi/boot.c        |    1 -
 xen/arch/x86/apic.c             |  108 +-
 xen/arch/x86/cdb.c              |    2 +-
 xen/arch/x86/dom0_ops.c         |    4 +-
 xen/arch/x86/domain.c           |   27 +-
 xen/arch/x86/domain_build.c     |    2 +-
 xen/arch/x86/io_apic.c          |    2 +-
 xen/arch/x86/irq.c              |    5 +-
 xen/arch/x86/microcode.c        |    1 -
 xen/arch/x86/mtrr/main.c        |    2 -
 xen/arch/x86/nmi.c              |   16 +-
 xen/arch/x86/setup.c            |   75 +-
 xen/arch/x86/shadow.c           |    2 +-
 xen/arch/x86/smp.c              |    9 +-
 xen/arch/x86/smpboot.c          | 1757 ++++++++++++++-----------
 xen/arch/x86/time.c             |    3 +-
 xen/arch/x86/traps.c            |    1 +
 xen/arch/x86/vmx.c              |   16 +-
 xen/common/ac_timer.c           |    6 +-
 xen/common/dom0_ops.c           |    9 +-
 xen/common/domain.c             |    7 +-
 xen/common/page_alloc.c         |    6 +-
 xen/common/perfc.c              |   13 +-
 xen/common/sched_bvt.c          |   59 +-
 xen/common/sched_sedf.c         | 2179 +++++++++++++++----------------
 xen/common/schedule.c           |  152 +--
 xen/common/trace.c              |    4 +-
 xen/include/asm-x86/asm_defns.h |    2 +
 xen/include/asm-x86/bitops.h    |  138 +-
 xen/include/asm-x86/div64.h     |   39 +-
 xen/include/asm-x86/flushtlb.h  |    2 +-
 xen/include/asm-x86/irq.h       |   33 +-
 xen/include/asm-x86/processor.h |    1 +
 xen/include/xen/bitmap.h        |    1 +
 xen/include/xen/cpumask.h       |  381 +++++-
 xen/include/xen/kernel.h        |   24 +
 xen/include/xen/sched-if.h      |    8 -
 xen/include/xen/sched.h         |    4 -
 xen/include/xen/smp.h           |   34 +-
 41 files changed, 2823 insertions(+), 2316 deletions(-)

diff --git a/xen/arch/ia64/domain.c b/xen/arch/ia64/domain.c
index 8f12179c5d..2dff8d5fd2 100644
--- a/xen/arch/ia64/domain.c
+++ b/xen/arch/ia64/domain.c
@@ -124,7 +124,6 @@ void startup_cpu_idle_loop(void)
 {
 	/* Just some sanity to ensure that the scheduler is set up okay. */
 	ASSERT(current->domain == IDLE_DOMAIN_ID);
-	domain_unpause_by_systemcontroller(current->domain);
 	raise_softirq(SCHEDULE_SOFTIRQ);
 	do_softirq();
 
diff --git a/xen/arch/ia64/xensetup.c b/xen/arch/ia64/xensetup.c
index 605ac157ca..ba6cd64f94 100644
--- a/xen/arch/ia64/xensetup.c
+++ b/xen/arch/ia64/xensetup.c
@@ -249,13 +249,11 @@ printk("About to call sort_main_extable()\n");
     /* Create initial domain 0. */
 printk("About to call do_createdomain()\n");
     dom0 = do_createdomain(0, 0);
-printk("About to call init_idle_task()\n");
     init_task.domain = &idle0_domain;
     init_task.processor = 0;
 //    init_task.mm = &init_mm;
     init_task.domain->arch.mm = &init_mm;
 //    init_task.thread = INIT_THREAD;
-    init_idle_task();
     //arch_do_createdomain(current);
 #ifdef CLONE_DOMAIN0
     {
@@ -314,7 +312,6 @@ printk("About to call init_trace_bufs()\n");
     console_endboot(cmdline && strstr(cmdline, "tty0"));
 #endif
 
-    domain_unpause_by_systemcontroller(current->domain);
 #ifdef CLONE_DOMAIN0
     {
     int i;
diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c
index 79c35b8719..19f6147648 100644
--- a/xen/arch/x86/acpi/boot.c
+++ b/xen/arch/x86/acpi/boot.c
@@ -34,7 +34,6 @@
 #include <asm/io_apic.h>
 #include <asm/apic.h>
 #include <asm/io.h>
-#include <asm/irq.h>
 #include <asm/mpspec.h>
 #include <mach_apic.h>
 #include <mach_mpparse.h>
diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c
index cb4bd1fd73..86bdb6253c 100644
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -663,7 +663,7 @@ void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound;
 
 #define APIC_DIVISOR 1
 
-static void __setup_APIC_LVTT(unsigned int clocks)
+void __setup_APIC_LVTT(unsigned int clocks)
 {
     unsigned int lvtt_value, tmp_value, ver;
 
@@ -680,30 +680,33 @@ static void __setup_APIC_LVTT(unsigned int clocks)
     apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
 }
 
-/*
- * this is done for every CPU from setup_APIC_clocks() below.
- * We setup each local APIC with a zero timeout value for now.
- * Unlike Linux, we don't have to wait for slices etc.
- */
-void setup_APIC_timer(void * data)
+static void __init setup_APIC_timer(unsigned int clocks)
 {
     unsigned long flags;
-    __save_flags(flags);
-    __sti();
-    __setup_APIC_LVTT(0);
-    __restore_flags(flags);
+    
+    local_irq_save(flags);
+
+    /*
+     * Wait for IRQ0's slice:
+     */
+    wait_timer_tick();
+
+    __setup_APIC_LVTT(clocks);
+
+    local_irq_restore(flags);
 }
 
 /*
- * In this function we calibrate APIC bus clocks to the external timer.
- *
- * As a result we have the Bus Speed and CPU speed in Hz.
- * 
- * We want to do the calibration only once (for CPU0).  CPUs connected by the
- * same APIC bus have the very same bus frequency.
+ * In this function we calibrate APIC bus clocks to the external
+ * timer. Unfortunately we cannot use jiffies and the timer irq
+ * to calibrate, since some later bootup code depends on getting
+ * the first irq? Ugh.
  *
- * This bit is a bit shoddy since we use the very same periodic timer interrupt
- * we try to eliminate to calibrate the APIC. 
+ * We want to do the calibration only once since we
+ * want to have local timer irqs synchronized. CPUs connected
+ * by the same APIC bus have the very same bus frequency.
+ * And we want to have irqs off anyway, no accidental
+ * APIC irq that way.
  */
 
 int __init calibrate_APIC_clock(void)
@@ -780,21 +783,48 @@ int __init calibrate_APIC_clock(void)
     return result;
 }
 
-/*
- * initialise the APIC timers for all CPUs
- * we start with the first and find out processor frequency and bus speed
- */
-void __init setup_APIC_clocks (void)
+
+static unsigned int calibration_result;
+
+void __init setup_boot_APIC_clock(void)
 {
+    apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
     using_apic_timer = 1;
-    __cli();
-    /* calibrate CPU0 for CPU speed and BUS speed */
-    bus_freq = calibrate_APIC_clock();
-    /* Now set up the timer for real. */
-    setup_APIC_timer((void *)bus_freq);
-    __sti();
-    /* and update all other cpus */
-    smp_call_function(setup_APIC_timer, (void *)bus_freq, 1, 1);
+
+    local_irq_disable();
+    
+    calibration_result = calibrate_APIC_clock();
+    /*
+     * Now set up the timer for real.
+     */
+    setup_APIC_timer(calibration_result);
+    
+    local_irq_enable();
+}
+
+void __init setup_secondary_APIC_clock(void)
+{
+    setup_APIC_timer(calibration_result);
+}
+
+void __init disable_APIC_timer(void)
+{
+    if (using_apic_timer) {
+        unsigned long v;
+        
+        v = apic_read(APIC_LVTT);
+        apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+    }
+}
+
+void enable_APIC_timer(void)
+{
+    if (using_apic_timer) {
+        unsigned long v;
+        
+        v = apic_read(APIC_LVTT);
+        apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
+    }
 }
 
 #undef APIC_DIVISOR
@@ -885,7 +915,7 @@ asmlinkage void smp_spurious_interrupt(struct cpu_user_regs *regs)
         ack_APIC_irq();
 
     /* see sw-dev-man vol 3, chapter 7.4.13.5 */
-    printk("spurious APIC interrupt on CPU#%d, should never happen.\n",
+    printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n",
            smp_processor_id());
 }
 
@@ -914,8 +944,8 @@ asmlinkage void smp_error_interrupt(struct cpu_user_regs *regs)
        6: Received illegal vector
        7: Illegal register address
     */
-    printk("APIC error on CPU%d: %02lx(%02lx)\n",
-            smp_processor_id(), v, v1);
+    printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+            smp_processor_id(), v , v1);
 }
 
 /*
@@ -940,20 +970,18 @@ int __init APIC_init_uniprocessor (void)
 
     connect_bsp_APIC();
 
-#ifdef CONFIG_SMP
-    cpu_online_map = 1;
-#endif
     phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
-    apic_write_around(APIC_ID, boot_cpu_physical_apicid);
 
     setup_local_APIC();
 
+    if (nmi_watchdog == NMI_LOCAL_APIC)
+        check_nmi_watchdog();
 #ifdef CONFIG_X86_IO_APIC
     if (smp_found_config)
         if (!skip_ioapic_setup && nr_ioapics)
             setup_IO_APIC();
 #endif
-    setup_APIC_clocks();
+    setup_boot_APIC_clock();
 
     return 0;
 }
diff --git a/xen/arch/x86/cdb.c b/xen/arch/x86/cdb.c
index 899493380f..f92e78f9c6 100644
--- a/xen/arch/x86/cdb.c
+++ b/xen/arch/x86/cdb.c
@@ -9,7 +9,7 @@
 #include <xen/lib.h>
 #include <asm/uaccess.h>
 #include <xen/serial.h>
-#include <asm/irq.h>
+#include <xen/irq.h>
 #include <xen/spinlock.h>
 #include <asm/debugger.h>
 #include <xen/init.h>
diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c
index 85fbe494f1..4232911978 100644
--- a/xen/arch/x86/dom0_ops.c
+++ b/xen/arch/x86/dom0_ops.c
@@ -176,8 +176,8 @@ long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op)
     {
         dom0_physinfo_t *pi = &op->u.physinfo;
 
-        pi->ht_per_core = opt_noht ? 1 : ht_per_core;
-        pi->cores       = smp_num_cpus / pi->ht_per_core;
+        pi->ht_per_core = ht_per_core;
+        pi->cores       = num_online_cpus() / ht_per_core;
         pi->total_pages = max_page;
         pi->free_pages  = avail_domheap_pages();
         pi->cpu_khz     = cpu_khz;
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index e046e9017d..30795b5831 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -73,44 +73,31 @@ static void default_idle(void)
 void idle_loop(void)
 {
     int cpu = smp_processor_id();
+
     for ( ; ; )
     {
         irq_stat[cpu].idle_timestamp = jiffies;
+
         while ( !softirq_pending(cpu) )
         {
             page_scrub_schedule_work();
             default_idle();
         }
+
         do_softirq();
     }
 }
 
-static void __startup_cpu_idle_loop(struct exec_domain *ed)
-{
-    /* Signal to boot CPU that we are done. */
-    init_idle();
-
-    /* Start normal idle loop. */
-    ed->arch.schedule_tail = continue_idle_task;
-    continue_idle_task(ed);
-}
-
 void startup_cpu_idle_loop(void)
 {
     struct exec_domain *ed = current;
 
-    /* Just some sanity to ensure that the scheduler is set up okay. */
-    ASSERT(ed->domain->domain_id == IDLE_DOMAIN_ID);
+    ASSERT(is_idle_task(ed->domain));
     percpu_ctxt[smp_processor_id()].curr_ed = ed;
     set_bit(smp_processor_id(), &ed->domain->cpuset);
-    domain_unpause_by_systemcontroller(ed->domain);
-
-    ed->arch.schedule_tail = __startup_cpu_idle_loop;
-    raise_softirq(SCHEDULE_SOFTIRQ);
-    do_softirq();
+    ed->arch.schedule_tail = continue_idle_task;
 
-    /* End up in __startup_cpu_idle_loop, not here. */
-    BUG();
+    idle_loop();
 }
 
 static long no_idt[2];
@@ -244,7 +231,7 @@ void arch_do_createdomain(struct exec_domain *ed)
 
     ed->arch.flags = TF_kernel_mode;
 
-    if ( d->domain_id == IDLE_DOMAIN_ID )
+    if ( is_idle_task(d) )
         return;
 
     ed->arch.schedule_tail = continue_nonidle_task;
diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c
index a8c66d6281..ebdbb622c8 100644
--- a/xen/arch/x86/domain_build.c
+++ b/xen/arch/x86/domain_build.c
@@ -438,7 +438,7 @@ int construct_dom0(struct domain *d,
     /* Mask all upcalls... */
     for ( i = 0; i < MAX_VIRT_CPUS; i++ )
         d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
-    d->shared_info->n_vcpu = smp_num_cpus;
+    d->shared_info->n_vcpu = num_online_cpus();
 
     /* Set up monitor table */
     update_pagetables(ed);
diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c
index 286313f6bb..71a8260453 100644
--- a/xen/arch/x86/io_apic.c
+++ b/xen/arch/x86/io_apic.c
@@ -2259,7 +2259,7 @@ int ioapic_guest_write(int apicid, int address, u32 val)
     
     pin = (address - 0x10) >> 1;
 
-    rte.dest.logical.logical_dest = target_cpus();
+    rte.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
     *(int *)&rte = val;
 
     if ( rte.vector >= FIRST_DEVICE_VECTOR )
diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index aaaff647ce..88807d2b3d 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -237,6 +237,7 @@ int pirq_guest_bind(struct exec_domain *ed, int irq, int will_share)
     irq_guest_action_t *action;
     unsigned long       flags;
     int                 rc = 0;
+    cpumask_t           cpumask = CPU_MASK_NONE;
 
     if ( !IS_CAPABLE_PHYSDEV(d) )
         return -EPERM;
@@ -273,9 +274,9 @@ int pirq_guest_bind(struct exec_domain *ed, int irq, int will_share)
         desc->handler->startup(irq);
 
         /* Attempt to bind the interrupt target to the correct CPU. */
+        cpu_set(ed->processor, cpumask);
         if ( desc->handler->set_affinity != NULL )
-            desc->handler->set_affinity(
-                irq, apicid_to_phys_cpu_present(ed->processor));
+            desc->handler->set_affinity(irq, cpumask);
     }
     else if ( !will_share || !action->shareable )
     {
diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c
index 4cbafae1e6..fcf4f94125 100644
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -86,7 +86,6 @@
 #define up(_m) spin_unlock(_m)
 #define vmalloc(_s) xmalloc_bytes(_s)
 #define vfree(_p) xfree(_p)
-#define num_online_cpus() smp_num_cpus
 
 #if 0
 MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
diff --git a/xen/arch/x86/mtrr/main.c b/xen/arch/x86/mtrr/main.c
index b6122d9d02..50c2f428b4 100644
--- a/xen/arch/x86/mtrr/main.c
+++ b/xen/arch/x86/mtrr/main.c
@@ -49,8 +49,6 @@
 #define down(_m) spin_lock(_m)
 #define up(_m) spin_unlock(_m)
 
-#define num_booting_cpus() smp_num_cpus
-
 u32 num_var_ranges = 0;
 
 unsigned int *usage_table;
diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c
index aef14645e4..94ec450d1b 100644
--- a/xen/arch/x86/nmi.c
+++ b/xen/arch/x86/nmi.c
@@ -92,13 +92,16 @@ int __init check_nmi_watchdog (void)
 
     printk("Testing NMI watchdog --- ");
 
-    for ( cpu = 0; cpu < smp_num_cpus; cpu++ ) 
+    for ( cpu = 0; cpu < NR_CPUS; cpu++ ) 
         prev_nmi_count[cpu] = nmi_count(cpu);
-    __sti();
+    local_irq_enable();
     mdelay((10*1000)/nmi_hz); /* wait 10 ticks */
 
-    for ( cpu = 0; cpu < smp_num_cpus; cpu++ ) 
+    for ( cpu = 0; cpu < NR_CPUS; cpu++ ) 
     {
+        if ( !cpu_isset(cpu, cpu_callin_map) && 
+             !cpu_isset(cpu, cpu_online_map) )
+            continue;
         if ( nmi_count(cpu) - prev_nmi_count[cpu] <= 5 )
             printk("CPU#%d stuck. ", cpu);
         else
@@ -277,13 +280,6 @@ void watchdog_enable(void)
     spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-void touch_nmi_watchdog (void)
-{
-    int i;
-    for (i = 0; i < smp_num_cpus; i++)
-        alert_counter[i] = 0;
-}
-
 void nmi_watchdog_tick (struct cpu_user_regs * regs)
 {
     int sum, cpu = smp_processor_id();
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index ee5c915d06..7fcadad2d7 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -33,6 +33,14 @@ integer_param("xenheap_megabytes", opt_xenheap_megabytes);
 int opt_noht = 0;
 boolean_param("noht", opt_noht);
 
+/* opt_nosmp: If true, secondary processors are ignored. */
+static int opt_nosmp = 0;
+boolean_param("nosmp", opt_nosmp);
+
+/* maxcpus: maximum number of CPUs to activate. */
+static unsigned int max_cpus = NR_CPUS;
+integer_param("maxcpus", max_cpus); 
+
 /* opt_watchdog: If true, run a watchdog NMI on each processor. */
 static int opt_watchdog = 0;
 boolean_param("watchdog", opt_watchdog);
@@ -58,6 +66,9 @@ boolean_param("noapic", skip_ioapic_setup);
 
 int early_boot = 1;
 
+int ht_per_core = 1;
+cpumask_t cpu_present_map;
+
 /* Limits of Xen heap, used to initialise the allocator. */
 unsigned long xenheap_phys_start, xenheap_phys_end;
 
@@ -67,7 +78,6 @@ extern void trap_init(void);
 extern void time_init(void);
 extern void ac_timer_init(void);
 extern void initialize_keytable();
-extern int do_timer_lists_from_pit;
 
 extern unsigned long cpu0_stack[];
 
@@ -80,13 +90,10 @@ unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE;
 #endif
 EXPORT_SYMBOL(mmu_cr4_features);
 
-unsigned long wait_init_idle;
-
 struct exec_domain *idle_task[NR_CPUS] = { &idle0_exec_domain };
 
 int acpi_disabled;
 
-int phys_proc_id[NR_CPUS];
 int logical_proc_id[NR_CPUS];
 
 /* Standard macro to see if a specific flag is changeable. */
@@ -147,12 +154,11 @@ static void __init init_intel(struct cpuinfo_x86 *c)
     if ( c->x86 == 6 && c->x86_model < 3 && c->x86_mask < 3 )
         clear_bit(X86_FEATURE_SEP, &c->x86_capability);
 
-#ifdef CONFIG_SMP
     if ( test_bit(X86_FEATURE_HT, &c->x86_capability) )
     {
         u32     eax, ebx, ecx, edx;
         int     initial_apic_id, siblings, cpu = smp_processor_id();
-        
+
         cpuid(1, &eax, &ebx, &ecx, &edx);
         ht_per_core = siblings = (ebx & 0xff0000) >> 16;
 
@@ -176,7 +182,6 @@ static void __init init_intel(struct cpuinfo_x86 *c)
                    cpu, phys_proc_id[cpu], logical_proc_id[cpu]);
         }
     }
-#endif
 
 #ifdef CONFIG_VMX
     start_vmx();
@@ -292,6 +297,10 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
     }
 }
 
+void __init print_cpu_info(struct cpuinfo_x86 *c)
+{
+    printk("booted.\n");
+}
 
 unsigned long cpu_initialized;
 void __init cpu_init(void)
@@ -335,8 +344,6 @@ void __init cpu_init(void)
 
     /* Install correct page table. */
     write_ptbase(current);
-
-    init_idle_task();
 }
 
 int acpi_force;
@@ -383,6 +390,8 @@ static void __init do_initcalls(void)
 
 static void __init start_of_day(void)
 {
+    int i;
+
     /* Unmap the first page of CPU0's stack. */
     memguard_guard_stack(cpu0_stack);
 
@@ -421,8 +430,6 @@ static void __init start_of_day(void)
 
     init_apic_mappings();
 
-    scheduler_init();	
-
     init_IRQ();
 
     trap_init();
@@ -431,41 +438,41 @@ static void __init start_of_day(void)
 
     arch_init_memory();
 
-    smp_boot_cpus();
+    scheduler_init();	
+
+    if ( opt_nosmp )
+        max_cpus = 0;
+    smp_prepare_cpus(max_cpus);
 
-    __sti();
+    /* We aren't hotplug-capable yet. */
+    BUG_ON(!cpus_empty(cpu_present_map));
+    for_each_cpu ( i )
+        cpu_set(i, cpu_present_map);
 
     initialize_keytable();
 
     serial_init_stage2();
 
-    if ( !cpu_has_apic )
+    ac_timer_init();
+
+    init_xen_time();
+
+    for_each_present_cpu ( i )
     {
-        do_timer_lists_from_pit = 1;
-        if ( smp_num_cpus != 1 )
-            panic("We need local APICs on SMP machines!");
+        if ( num_online_cpus() >= max_cpus )
+            break;
+        if ( !cpu_online(i) )
+            __cpu_up(i);
     }
 
-    ac_timer_init();    /* init accurate timers */
-    init_xen_time();	/* initialise the time */
-    schedulers_start(); /* start scheduler for each CPU */
-
-    check_nmi_watchdog();
+    printk("Brought up %ld CPUs\n", (long)num_online_cpus());
+    smp_cpus_done(max_cpus);
 
     do_initcalls();
 
-    wait_init_idle = cpu_online_map;
-    clear_bit(smp_processor_id(), &wait_init_idle);
-    smp_threads_ready = 1;
-    smp_commence(); /* Tell other CPUs that state of the world is stable. */
-    while ( wait_init_idle != 0 )
-        cpu_relax();
+    schedulers_start();
 
     watchdog_enable();
-
-#ifdef CONFIG_X86_64 /* x86_32 uses low mappings when building DOM0. */
-    zap_low_mappings();
-#endif
 }
 
 #define EARLY_FAIL() for ( ; ; ) __asm__ __volatile__ ( "hlt" )
@@ -487,6 +494,8 @@ void __init __start_xen(multiboot_info_t *mbi)
     set_current(&idle0_exec_domain);
     set_processor_id(0);
 
+    smp_prepare_boot_cpu();
+
     /* We initialise the serial devices very early so we can get debugging. */
     serial_init_stage1();
 
@@ -695,8 +704,8 @@ void __init __start_xen(multiboot_info_t *mbi)
     /* Hide UART from DOM0 if we're using it */
     serial_endboot();
 
-    domain_unpause_by_systemcontroller(current->domain);
     domain_unpause_by_systemcontroller(dom0);
+
     startup_cpu_idle_loop();
 }
 
diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c
index 86ae84f116..83d7fc11b2 100644
--- a/xen/arch/x86/shadow.c
+++ b/xen/arch/x86/shadow.c
@@ -2525,7 +2525,7 @@ void __shadow_sync_all(struct domain *d)
     // page table page needs to be vcpu private).
     //
 #if 0 // this should be enabled for SMP guests...
-    flush_tlb_mask(((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id()));
+    flush_tlb_mask(((1<<num_online_cpus()) - 1) & ~(1<<smp_processor_id()));
 #endif
     need_flush = 1;
 
diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c
index 32641165a5..71b565e097 100644
--- a/xen/arch/x86/smp.c
+++ b/xen/arch/x86/smp.c
@@ -141,7 +141,7 @@ static inline void send_IPI_allbutself(int vector)
      * If there are no other CPUs in the system then we get an APIC send error 
      * if we try to broadcast. thus we have to avoid sending IPIs in this case.
      */
-    if ( smp_num_cpus <= 1 )
+    if ( num_online_cpus() <= 1 )
         return;
 
     __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
@@ -192,10 +192,10 @@ void new_tlbflush_clock_period(void)
     ASSERT(local_irq_is_enabled());
     
     /* Flush everyone else. We definitely flushed just before entry. */
-    if ( smp_num_cpus > 1 )
+    if ( num_online_cpus() > 1 )
     {
         spin_lock(&flush_lock);
-        flush_cpumask  = (1UL << smp_num_cpus) - 1;
+        flush_cpumask  = (1UL << num_online_cpus()) - 1;
         flush_cpumask &= ~(1UL << smp_processor_id());
         flush_va       = FLUSHVA_ALL;
         send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
@@ -257,7 +257,7 @@ int smp_call_function(
 
     ASSERT(local_irq_is_enabled());
 
-    cpuset = ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id());
+    cpuset = ((1UL << num_online_cpus()) - 1) & ~(1UL << smp_processor_id());
     if ( cpuset == 0 )
         return 0;
 
@@ -295,7 +295,6 @@ void smp_send_stop(void)
 {
     /* Stop all other CPUs in the system. */
     smp_call_function(stop_this_cpu, NULL, 1, 0);
-    smp_num_cpus = 1;
 
     local_irq_disable();
     disable_local_APIC();
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 4dcdf025c0..5b43462e50 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -17,7 +17,7 @@
  *	Fixes
  *		Felix Koop	:	NR_CPUS used properly
  *		Jose Renau	:	Handle single CPU case.
- *		Alan Cox	:	By repeated request 8) - Total BogoMIP report.
+ *		Alan Cox	:	By repeated request 8) - Total BogoMIPS report.
  *		Greg Wright	:	Fix for kernel stacks panic.
  *		Erich Boleyn	:	MP v1.4 and additional changes.
  *	Matthias Sattler	:	Changes for 2.1 kernel map.
@@ -30,52 +30,51 @@
  *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
  *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
  *		Martin J. Bligh	: 	Added support for multi-quad systems
- */
+ *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
+ *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process. */
 
 #include <xen/config.h>
 #include <xen/init.h>
-#include <xen/irq.h>
+#include <xen/kernel.h>
 #include <xen/mm.h>
-#include <xen/slab.h>
-#include <asm/flushtlb.h>
-#include <asm/mc146818rtc.h>
-#include <asm/smpboot.h>
-#include <xen/smp.h>
-#include <asm/msr.h>
-#include <asm/system.h>
-#include <asm/mpspec.h>
-#include <asm/io_apic.h>
 #include <xen/sched.h>
+#include <xen/irq.h>
 #include <xen/delay.h>
-#include <xen/lib.h>
+#include <asm/mc146818rtc.h>
+#include <asm/desc.h>
+#include <asm/div64.h>
+#include <asm/msr.h>
 #include <mach_apic.h>
 #include <mach_wakecpu.h>
+#include <smpboot_hooks.h>
 
-/* opt_nosmp: If true, secondary processors are ignored. */
-static int opt_nosmp = 0;
-boolean_param("nosmp", opt_nosmp);
-
-/* maxcpus: maximum number of CPUs to activate. */
-static int max_cpus = -1;
-integer_param("maxcpus", max_cpus); 
+static int _foo;
+#define set_kernel_exec(x,y) (_foo=0)
+#define alloc_bootmem_low_pages(x) __va(0x90000) /* trampoline address */
+int tainted;
+#define TAINT_UNSAFE_SMP 0
 
-/* Total count of live CPUs */
-int smp_num_cpus = 1;
+/* Set if we find a B stepping CPU */
+static int __initdata smp_b_stepping;
 
-/* Number of hyperthreads per core */
-int ht_per_core = 1;
+/* Number of siblings per CPU package */
+int smp_num_siblings = 1;
+int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */
+EXPORT_SYMBOL(phys_proc_id);
 
-/* Bitmask of currently online CPUs */
+/* bitmap of online cpus */
 cpumask_t cpu_online_map;
 
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
+static cpumask_t smp_commenced_mask;
 
 /* Per CPU bogomips and other parameters */
-struct cpuinfo_x86 cpu_data[NR_CPUS];
+struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 
-/* Set when the idlers are all forked */
-int smp_threads_ready;
+u8 x86_cpu_to_apicid[NR_CPUS] =
+			{ [0 ... NR_CPUS-1] = 0xff };
+EXPORT_SYMBOL(x86_cpu_to_apicid);
 
 /*
  * Trampoline 80x86 program as an array.
@@ -84,6 +83,7 @@ int smp_threads_ready;
 extern unsigned char trampoline_data [];
 extern unsigned char trampoline_end  [];
 static unsigned char *trampoline_base;
+static int trampoline_exec;
 
 /*
  * Currently trivial. Write the real->protected mode
@@ -93,8 +93,8 @@ static unsigned char *trampoline_base;
 
 static unsigned long __init setup_trampoline(void)
 {
-    memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
-    return virt_to_phys(trampoline_base);
+	memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
+	return virt_to_phys(trampoline_base);
 }
 
 /*
@@ -103,11 +103,17 @@ static unsigned long __init setup_trampoline(void)
  */
 void __init smp_alloc_memory(void)
 {
-    /*
-     * Has to be in very low memory so we can execute
-     * real-mode AP code.
-     */
-    trampoline_base = __va(0x90000);
+	trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
+	/*
+	 * Has to be in very low memory so we can execute
+	 * real-mode AP code.
+	 */
+	if (__pa(trampoline_base) >= 0x9F000)
+		BUG();
+	/*
+	 * Make the SMP trampoline executable:
+	 */
+	trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
 }
 
 /*
@@ -115,40 +121,63 @@ void __init smp_alloc_memory(void)
  * a given CPU
  */
 
-void __init smp_store_cpu_info(int id)
+static void __init smp_store_cpu_info(int id)
 {
-    cpu_data[id] = boot_cpu_data;
-    if (id != 0)
-        identify_cpu(&cpu_data[id]);
-}
-
-/*
- * Architecture specific routine called by the kernel just before init is
- * fired off. This allows the BP to have everything in order [we hope].
- * At the end of this all the APs will hit the system scheduling and off
- * we go. Each AP will load the system gdt's and jump through the kernel
- * init into idle(). At this point the scheduler will one day take over
- * and give them jobs to do. smp_callin is a standard routine
- * we use to track CPUs as they power up.
- */
-
-static atomic_t smp_commenced = ATOMIC_INIT(0);
-
-void __init smp_commence(void)
-{
-    /*
-     * Lets the callins below out of their loop.
-     */
-    Dprintk("Setting commenced=1, go go go\n");
-
-    wmb();
-    atomic_set(&smp_commenced,1);
+	struct cpuinfo_x86 *c = cpu_data + id;
+
+	*c = boot_cpu_data;
+	if (id!=0)
+		identify_cpu(c);
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86_vendor == X86_VENDOR_INTEL &&
+	    c->x86 == 5 &&
+	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_model <= 3)
+		/*
+		 * Remember we have B step Pentia with bugs
+		 */
+		smp_b_stepping = 1;
+
+	/*
+	 * Certain Athlons might work (for various values of 'work') in SMP
+	 * but they are not certified as MP capable.
+	 */
+	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
+
+		/* Athlon 660/661 is valid. */	
+		if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
+			goto valid_k7;
+
+		/* Duron 670 is valid */
+		if ((c->x86_model==7) && (c->x86_mask==0))
+			goto valid_k7;
+
+		/*
+		 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
+		 * It's worth noting that the A5 stepping (662) of some Athlon XP's
+		 * have the MP bit set.
+		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
+		 */
+		if (((c->x86_model==6) && (c->x86_mask>=2)) ||
+		    ((c->x86_model==7) && (c->x86_mask>=1)) ||
+		     (c->x86_model> 7))
+			if (cpu_has_mp)
+				goto valid_k7;
+
+		/* If we get here, it's not a certified SMP capable AMD system. */
+		tainted |= TAINT_UNSAFE_SMP;
+	}
+
+valid_k7:
+	;
 }
 
 /*
  * TSC synchronization.
  *
- * We first check wether all CPUs have their TSC's synchronized,
+ * We first check whether all CPUs have their TSC's synchronized,
  * then we print a warning if not, and always resync.
  */
 
@@ -159,616 +188,724 @@ static unsigned long long tsc_values[NR_CPUS];
 
 #define NR_LOOPS 5
 
-/*
- * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit
- * multiplication. Not terribly optimized but we need it at boot time only
- * anyway.
- *
- * result == a / b
- *	== (a1 + a2*(2^32)) / b
- *	== a1/b + a2*(2^32/b)
- *	== a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b
- *		    ^---- (this multiplication can overflow)
- */
-
-static unsigned long long div64 (unsigned long long a, unsigned long b0)
-{
-    unsigned int a1, a2;
-    unsigned long long res;
-
-    a1 = ((unsigned int*)&a)[0];
-    a2 = ((unsigned int*)&a)[1];
-
-    res = a1/b0 +
-        (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) +
-        a2 / b0 +
-        (a2 * (0xffffffff % b0)) / b0;
-
-    return res;
-}
-
 static void __init synchronize_tsc_bp (void)
 {
-    int i;
-    unsigned long long t0;
-    unsigned long long sum, avg;
-    long long delta;
-    int buggy = 0;
-
-    printk("checking TSC synchronization across CPUs: ");
-
-    atomic_set(&tsc_start_flag, 1);
-    wmb();
-
-    /*
-     * We loop a few times to get a primed instruction cache,
-     * then the last pass is more or less synchronized and
-     * the BP and APs set their cycle counters to zero all at
-     * once. This reduces the chance of having random offsets
-     * between the processors, and guarantees that the maximum
-     * delay between the cycle counters is never bigger than
-     * the latency of information-passing (cachelines) between
-     * two CPUs.
-     */
-    for (i = 0; i < NR_LOOPS; i++) {
-        /*
-         * all APs synchronize but they loop on '== num_cpus'
-         */
-        while (atomic_read(&tsc_count_start) != smp_num_cpus-1) mb();
-        atomic_set(&tsc_count_stop, 0);
-        wmb();
-        /*
-         * this lets the APs save their current TSC:
-         */
-        atomic_inc(&tsc_count_start);
-
-        rdtscll(tsc_values[smp_processor_id()]);
-        /*
-         * We clear the TSC in the last loop:
-         */
-        if (i == NR_LOOPS-1)
-            write_tsc(0, 0);
-
-        /*
-         * Wait for all APs to leave the synchronization point:
-         */
-        while (atomic_read(&tsc_count_stop) != smp_num_cpus-1) mb();
-        atomic_set(&tsc_count_start, 0);
-        wmb();
-        atomic_inc(&tsc_count_stop);
-    }
-
-    sum = 0;
-    for (i = 0; i < smp_num_cpus; i++) {
-        t0 = tsc_values[i];
-        sum += t0;
-    }
-    avg = div64(sum, smp_num_cpus);
-
-    sum = 0;
-    for (i = 0; i < smp_num_cpus; i++) {
-        delta = tsc_values[i] - avg;
-        if (delta < 0)
-            delta = -delta;
-        /*
-         * We report bigger than 2 microseconds clock differences.
-         */
-        if (delta > 2*ticks_per_usec) {
-            long realdelta;
-            if (!buggy) {
-                buggy = 1;
-                printk("\n");
-            }
-            realdelta = div64(delta, ticks_per_usec);
-            if (tsc_values[i] < avg)
-                realdelta = -realdelta;
-
-            printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n",
-                   i, realdelta);
-        }
-
-        sum += delta;
-    }
-    if (!buggy)
-        printk("passed.\n");
+	int i;
+	unsigned long long t0;
+	unsigned long long sum, avg;
+	long long delta;
+	unsigned long one_usec;
+	int buggy = 0;
+
+	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
+
+	/* convert from kcyc/sec to cyc/usec */
+	one_usec = cpu_khz / 1000;
+
+	atomic_set(&tsc_start_flag, 1);
+	wmb();
+
+	/*
+	 * We loop a few times to get a primed instruction cache,
+	 * then the last pass is more or less synchronized and
+	 * the BP and APs set their cycle counters to zero all at
+	 * once. This reduces the chance of having random offsets
+	 * between the processors, and guarantees that the maximum
+	 * delay between the cycle counters is never bigger than
+	 * the latency of information-passing (cachelines) between
+	 * two CPUs.
+	 */
+	for (i = 0; i < NR_LOOPS; i++) {
+		/*
+		 * all APs synchronize but they loop on '== num_cpus'
+		 */
+		while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
+			mb();
+		atomic_set(&tsc_count_stop, 0);
+		wmb();
+		/*
+		 * this lets the APs save their current TSC:
+		 */
+		atomic_inc(&tsc_count_start);
+
+		rdtscll(tsc_values[smp_processor_id()]);
+		/*
+		 * We clear the TSC in the last loop:
+		 */
+		if (i == NR_LOOPS-1)
+			write_tsc(0, 0);
+
+		/*
+		 * Wait for all APs to leave the synchronization point:
+		 */
+		while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
+			mb();
+		atomic_set(&tsc_count_start, 0);
+		wmb();
+		atomic_inc(&tsc_count_stop);
+	}
+
+	sum = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (cpu_isset(i, cpu_callout_map)) {
+			t0 = tsc_values[i];
+			sum += t0;
+		}
+	}
+	avg = sum;
+	do_div(avg, num_booting_cpus());
+
+	sum = 0;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_isset(i, cpu_callout_map))
+			continue;
+		delta = tsc_values[i] - avg;
+		if (delta < 0)
+			delta = -delta;
+		/*
+		 * We report bigger than 2 microseconds clock differences.
+		 */
+		if (delta > 2*one_usec) {
+			long realdelta;
+			if (!buggy) {
+				buggy = 1;
+				printk("\n");
+			}
+			realdelta = delta;
+			do_div(realdelta, one_usec);
+			if (tsc_values[i] < avg)
+				realdelta = -realdelta;
+
+			printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta);
+		}
+
+		sum += delta;
+	}
+	if (!buggy)
+		printk("passed.\n");
 }
 
 static void __init synchronize_tsc_ap (void)
 {
-    int i;
-
-    /*
-     * smp_num_cpus is not necessarily known at the time
-     * this gets called, so we first wait for the BP to
-     * finish SMP initialization:
-     */
-    while (!atomic_read(&tsc_start_flag)) mb();
-
-    for (i = 0; i < NR_LOOPS; i++) {
-        atomic_inc(&tsc_count_start);
-        while (atomic_read(&tsc_count_start) != smp_num_cpus) mb();
-
-        rdtscll(tsc_values[smp_processor_id()]);
-        if (i == NR_LOOPS-1)
-            write_tsc(0, 0);
-
-        atomic_inc(&tsc_count_stop);
-        while (atomic_read(&tsc_count_stop) != smp_num_cpus) mb();
-    }
+	int i;
+
+	/*
+	 * Not every cpu is online at the time
+	 * this gets called, so we first wait for the BP to
+	 * finish SMP initialization:
+	 */
+	while (!atomic_read(&tsc_start_flag)) mb();
+
+	for (i = 0; i < NR_LOOPS; i++) {
+		atomic_inc(&tsc_count_start);
+		while (atomic_read(&tsc_count_start) != num_booting_cpus())
+			mb();
+
+		rdtscll(tsc_values[smp_processor_id()]);
+		if (i == NR_LOOPS-1)
+			write_tsc(0, 0);
+
+		atomic_inc(&tsc_count_stop);
+		while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+	}
 }
 #undef NR_LOOPS
 
+extern void calibrate_delay(void);
+
 static atomic_t init_deasserted;
 
 void __init smp_callin(void)
 {
-    int cpuid, phys_id, i;
-
-    /*
-     * If waken up by an INIT in an 82489DX configuration
-     * we may get here before an INIT-deassert IPI reaches
-     * our local APIC.  We have to wait for the IPI or we'll
-     * lock up on an APIC access.
-     */
-    while (!atomic_read(&init_deasserted));
-
-    /*
-     * (This works even if the APIC is not enabled.)
-     */
-    phys_id = GET_APIC_ID(apic_read(APIC_ID));
-    cpuid = smp_processor_id();
-    if (test_and_set_bit(cpuid, &cpu_online_map)) {
-        printk("huh, phys CPU#%d, CPU#%d already present??\n",
-               phys_id, cpuid);
-        BUG();
-    }
-    Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
-    /*
-     * STARTUP IPIs are fragile beasts as they might sometimes
-     * trigger some glue motherboard logic. Complete APIC bus
-     * silence for 1 second, this overestimates the time the
-     * boot CPU is spending to send the up to 2 STARTUP IPIs
-     * by a factor of two. This should be enough.
-     */
-
-    for ( i = 0; i < 200; i++ )
-    {
-        if ( test_bit(cpuid, &cpu_callout_map) ) break;
-        mdelay(10);
-    }
-
-    if (!test_bit(cpuid, &cpu_callout_map)) {
-        printk("BUG: CPU%d started up but did not get a callout!\n",
-               cpuid);
-        BUG();
-    }
-
-    /*
-     * the boot CPU has finished the init stage and is spinning
-     * on callin_map until we finish. We are free to set up this
-     * CPU, first the APIC. (this is probably redundant on most
-     * boards)
-     */
-
-    Dprintk("CALLIN, before setup_local_APIC().\n");
-
-    setup_local_APIC();
-
-    __sti();
-
-    Dprintk("Stack at about %p\n",&cpuid);
-
-    /*
-     * Save our processor parameters
-     */
-    smp_store_cpu_info(cpuid);
-
-    /*
-     * Allow the master to continue.
-     */
-    set_bit(cpuid, &cpu_callin_map);
-
-    /*
-     *      Synchronize the TSC with the BP
-     */
-    synchronize_tsc_ap();
+	int cpuid, phys_id, i;
+
+	/*
+	 * If woken up by an INIT in an 82489DX configuration
+	 * we may get here before an INIT-deassert IPI reaches
+	 * our local APIC.  We have to wait for the IPI or we'll
+	 * lock up on an APIC access.
+	 */
+	wait_for_init_deassert(&init_deasserted);
+
+	/*
+	 * (This works even if the APIC is not enabled.)
+	 */
+	phys_id = GET_APIC_ID(apic_read(APIC_ID));
+	cpuid = smp_processor_id();
+	if (cpu_isset(cpuid, cpu_callin_map)) {
+		printk("huh, phys CPU#%d, CPU#%d already present??\n",
+					phys_id, cpuid);
+		BUG();
+	}
+	Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+
+	/*
+	 * STARTUP IPIs are fragile beasts as they might sometimes
+	 * trigger some glue motherboard logic. Complete APIC bus
+	 * silence for 1 second, this overestimates the time the
+	 * boot CPU is spending to send the up to 2 STARTUP IPIs
+	 * by a factor of two. This should be enough.
+	 */
+
+	/*
+	 * Waiting 2s total for startup
+	 */
+	for (i = 0; i < 200; i++) {
+		/*
+		 * Has the boot CPU finished its STARTUP sequence?
+		 */
+		if (cpu_isset(cpuid, cpu_callout_map))
+			break;
+		rep_nop();
+		mdelay(10);
+	}
+
+	if (!cpu_isset(cpuid, cpu_callout_map)) {
+		printk("BUG: CPU%d started up but did not get a callout!\n",
+			cpuid);
+		BUG();
+	}
+
+	/*
+	 * the boot CPU has finished the init stage and is spinning
+	 * on callin_map until we finish. We are free to set up this
+	 * CPU, first the APIC. (this is probably redundant on most
+	 * boards)
+	 */
+
+	Dprintk("CALLIN, before setup_local_APIC().\n");
+	smp_callin_clear_local_apic();
+	setup_local_APIC();
+	map_cpu_to_logical_apicid();
+
+#if 0
+	/*
+	 * Get our bogomips.
+	 */
+	calibrate_delay();
+	Dprintk("Stack at about %p\n",&cpuid);
+#endif
+
+	/*
+	 * Save our processor parameters
+	 */
+ 	smp_store_cpu_info(cpuid);
+
+	disable_APIC_timer();
+
+	/*
+	 * Allow the master to continue.
+	 */
+	cpu_set(cpuid, cpu_callin_map);
+
+	/*
+	 *      Synchronize the TSC with the BP
+	 */
+	if (cpu_has_tsc && cpu_khz)
+		synchronize_tsc_ap();
 }
 
-static int cpucount;
+int cpucount;
 
-#ifdef __i386__
+#ifdef CONFIG_X86_32
 static void construct_percpu_idt(unsigned int cpu)
 {
-    unsigned char idt_load[10];
+	unsigned char idt_load[10];
 
-    idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
-    memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t));
+	idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES);
+	memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t));
 
-    *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
-    *(unsigned long  *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
-    __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) );
+	*(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1;
+	*(unsigned long  *)(&idt_load[2]) = (unsigned long)idt_tables[cpu];
+	__asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) );
 }
 #endif
 
 /*
  * Activate a secondary processor.
  */
-void __init start_secondary(void)
+void __init start_secondary(void *unused)
 {
-    unsigned int cpu = cpucount;
-
-    extern void percpu_traps_init(void);
-    extern void cpu_init(void);
-
-    set_current(idle_task[cpu]);
-    set_processor_id(cpu);
+	unsigned int cpu = cpucount;
 
-    percpu_traps_init();
+	extern void percpu_traps_init(void);
+	extern void cpu_init(void);
 
-    cpu_init();
+	set_current(idle_task[cpu]);
+	set_processor_id(cpu);
 
-    smp_callin();
+	percpu_traps_init();
 
-    while (!atomic_read(&smp_commenced))
-        cpu_relax();
+	cpu_init();
+	smp_callin();
+	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
+		rep_nop();
 
-#ifdef __i386__
-    /*
-     * At this point, boot CPU has fully initialised the IDT. It is
-     * now safe to make ourselves a private copy.
-     */
-    construct_percpu_idt(cpu);
+#ifdef CONFIG_X86_32
+	/*
+	 * At this point, boot CPU has fully initialised the IDT. It is
+	 * now safe to make ourselves a private copy.
+	 */
+	construct_percpu_idt(cpu);
 #endif
 
-    local_flush_tlb();
+	setup_secondary_APIC_clock();
+	enable_APIC_timer();
 
-    startup_cpu_idle_loop();
+	/*
+	 * low-memory mappings have been cleared, flush them from
+	 * the local TLBs too.
+	 */
+	local_flush_tlb();
+	cpu_set(smp_processor_id(), cpu_online_map);
 
-    BUG();
+	/* We can take interrupts now: we're officially "up". */
+	local_irq_enable();
+
+	wmb();
+	startup_cpu_idle_loop();
 }
 
 extern struct {
-    unsigned long esp, ss;
+	void * esp;
+	unsigned short ss;
 } stack_start;
 
-/* which physical APIC ID maps to which logical CPU number */
-volatile int physical_apicid_2_cpu[MAX_APICID];
-/* which logical CPU number maps to which physical APIC ID */
-volatile int cpu_2_physical_apicid[NR_CPUS];
+#ifdef CONFIG_NUMA
 
-/* which logical APIC ID maps to which logical CPU number */
-volatile int logical_apicid_2_cpu[MAX_APICID];
-/* which logical CPU number maps to which logical APIC ID */
-volatile int cpu_2_logical_apicid[NR_CPUS];
+/* which logical CPUs are on which nodes */
+cpumask_t node_2_cpu_mask[MAX_NUMNODES] =
+				{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
+/* which node each logical CPU is on */
+int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
+EXPORT_SYMBOL(cpu_2_node);
 
-static inline void init_cpu_to_apicid(void)
-/* Initialize all maps between cpu number and apicids */
+/* set up a mapping between cpu and node. */
+static inline void map_cpu_to_node(int cpu, int node)
 {
-    int apicid, cpu;
-
-    for (apicid = 0; apicid < MAX_APICID; apicid++) {
-        physical_apicid_2_cpu[apicid] = -1;
-        logical_apicid_2_cpu[apicid] = -1;
-    }
-    for (cpu = 0; cpu < NR_CPUS; cpu++) {
-        cpu_2_physical_apicid[cpu] = -1;
-        cpu_2_logical_apicid[cpu] = -1;
-    }
+	printk("Mapping cpu %d to node %d\n", cpu, node);
+	cpu_set(cpu, node_2_cpu_mask[node]);
+	cpu_2_node[cpu] = node;
 }
 
-static inline void map_cpu_to_boot_apicid(int cpu, int apicid)
-/* 
- * set up a mapping between cpu and apicid. Uses logical apicids for multiquad,
- * else physical apic ids
- */
+/* undo a mapping between cpu and node. */
+static inline void unmap_cpu_to_node(int cpu)
 {
-    physical_apicid_2_cpu[apicid] = cpu;	
-    cpu_2_physical_apicid[cpu] = apicid;
+	int node;
+
+	printk("Unmapping cpu %d from all nodes\n", cpu);
+	for (node = 0; node < MAX_NUMNODES; node ++)
+		cpu_clear(cpu, node_2_cpu_mask[node]);
+	cpu_2_node[cpu] = 0;
 }
+#else /* !CONFIG_NUMA */
 
-static inline void unmap_cpu_to_boot_apicid(int cpu, int apicid)
-/* 
- * undo a mapping between cpu and apicid. Uses logical apicids for multiquad,
- * else physical apic ids
- */
+#define map_cpu_to_node(cpu, node)	({})
+#define unmap_cpu_to_node(cpu)	({})
+
+#endif /* CONFIG_NUMA */
+
+u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+void map_cpu_to_logical_apicid(void)
+{
+	int cpu = smp_processor_id();
+	int apicid = logical_smp_processor_id();
+
+	cpu_2_logical_apicid[cpu] = apicid;
+	map_cpu_to_node(cpu, apicid_to_node(apicid));
+}
+
+void unmap_cpu_to_logical_apicid(int cpu)
 {
-    physical_apicid_2_cpu[apicid] = -1;	
-    cpu_2_physical_apicid[cpu] = -1;
+	cpu_2_logical_apicid[cpu] = BAD_APICID;
+	unmap_cpu_to_node(cpu);
 }
 
 #if APIC_DEBUG
-static inline void inquire_remote_apic(int apicid)
+static inline void __inquire_remote_apic(int apicid)
 {
-    int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
-    char *names[] = { "ID", "VERSION", "SPIV" };
-    int timeout, status;
-
-    printk("Inquiring remote APIC #%d...\n", apicid);
-
-    for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
-        printk("... APIC #%d %s: ", apicid, names[i]);
-
-        /*
-         * Wait for idle.
-         */
-        apic_wait_icr_idle();
-
-        apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-        apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
-
-        timeout = 0;
-        do {
-            udelay(100);
-            status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
-        } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
-        switch (status) {
-        case APIC_ICR_RR_VALID:
-            status = apic_read(APIC_RRR);
-            printk("%08x\n", status);
-            break;
-        default:
-            printk("failed\n");
-        }
-    }
+	int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+	char *names[] = { "ID", "VERSION", "SPIV" };
+	int timeout, status;
+
+	printk("Inquiring remote APIC #%d...\n", apicid);
+
+	for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) {
+		printk("... APIC #%d %s: ", apicid, names[i]);
+
+		/*
+		 * Wait for idle.
+		 */
+		apic_wait_icr_idle();
+
+		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
+		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+
+		timeout = 0;
+		do {
+			udelay(100);
+			status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
+		} while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
+
+		switch (status) {
+		case APIC_ICR_RR_VALID:
+			status = apic_read(APIC_RRR);
+			printk("%08x\n", status);
+			break;
+		default:
+			printk("failed\n");
+		}
+	}
 }
 #endif
 
+#ifdef WAKE_SECONDARY_VIA_NMI
+/* 
+ * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
+ * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
+ * won't ... remember to clear down the APIC, etc later.
+ */
+static int __init
+wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
+{
+	unsigned long send_status = 0, accept_status = 0;
+	int timeout, maxlvt;
+
+	/* Target chip */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
+
+	/* Boot on the stack */
+	/* Kick the second */
+	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	/*
+	 * Give the other CPU some time to accept the IPI.
+	 */
+	udelay(200);
+	/*
+	 * Due to the Pentium erratum 3AP.
+	 */
+	maxlvt = get_maxlvt();
+	if (maxlvt > 3) {
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+	}
+	accept_status = (apic_read(APIC_ESR) & 0xEF);
+	Dprintk("NMI sent.\n");
+
+	if (send_status)
+		printk("APIC never delivered???\n");
+	if (accept_status)
+		printk("APIC delivery error (%lx).\n", accept_status);
+
+	return (send_status | accept_status);
+}
+#endif	/* WAKE_SECONDARY_VIA_NMI */
 
-static int wakeup_secondary_via_INIT(int phys_apicid, unsigned long start_eip)
+#ifdef WAKE_SECONDARY_VIA_INIT
+static int __init
+wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 {
-    unsigned long send_status = 0, accept_status = 0;
-    int maxlvt, timeout, num_starts, j;
-
-    Dprintk("Asserting INIT.\n");
-
-    /*
-     * Turn INIT on target chip
-     */
-    apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-    /*
-     * Send IPI
-     */
-    apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
-                      | APIC_DM_INIT);
-
-    Dprintk("Waiting for send to finish...\n");
-    timeout = 0;
-    do {
-        Dprintk("+");
-        udelay(100);
-        send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-    } while (send_status && (timeout++ < 1000));
-
-    mdelay(10);
-
-    Dprintk("Deasserting INIT.\n");
-
-    /* Target chip */
-    apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-    /* Send IPI */
-    apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
-
-    Dprintk("Waiting for send to finish...\n");
-    timeout = 0;
-    do {
-        Dprintk("+");
-        udelay(100);
-        send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-    } while (send_status && (timeout++ < 1000));
-
-    atomic_set(&init_deasserted, 1);
-
-    /*
-     * Should we send STARTUP IPIs ?
-     *
-     * Determine this based on the APIC version.
-     * If we don't have an integrated APIC, don't send the STARTUP IPIs.
-     */
-    if (APIC_INTEGRATED(apic_version[phys_apicid]))
-        num_starts = 2;
-    else
-        num_starts = 0;
-
-    /*
-     * Run STARTUP IPI loop.
-     */
-    Dprintk("#startup loops: %d.\n", num_starts);
-
-    maxlvt = get_maxlvt();
-
-    for (j = 1; j <= num_starts; j++) {
-        Dprintk("Sending STARTUP #%d.\n",j);
-
-        apic_read_around(APIC_SPIV);
-        apic_write(APIC_ESR, 0);
-        apic_read(APIC_ESR);
-        Dprintk("After apic_write.\n");
-
-        /*
-         * STARTUP IPI
-         */
-
-        /* Target chip */
-        apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
-        /* Boot on the stack */
-        /* Kick the second */
-        apic_write_around(APIC_ICR, APIC_DM_STARTUP
-                          | (start_eip >> 12));
-
-        /*
-         * Give the other CPU some time to accept the IPI.
-         */
-        udelay(300);
-
-        Dprintk("Startup point 1.\n");
-
-        Dprintk("Waiting for send to finish...\n");
-        timeout = 0;
-        do {
-            Dprintk("+");
-            udelay(100);
-            send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-        } while (send_status && (timeout++ < 1000));
-
-        /*
-         * Give the other CPU some time to accept the IPI.
-         */
-        udelay(200);
-        /*
-         * Due to the Pentium erratum 3AP.
-         */
-        if (maxlvt > 3) {
-            apic_read_around(APIC_SPIV);
-            apic_write(APIC_ESR, 0);
-        }
-        accept_status = (apic_read(APIC_ESR) & 0xEF);
-        if (send_status || accept_status)
-            break;
-    }
-    Dprintk("After Startup.\n");
-
-    if (send_status)
-        printk("APIC never delivered???\n");
-    if (accept_status)
-        printk("APIC delivery error (%lx).\n", accept_status);
-
-    return (send_status | accept_status);
+	unsigned long send_status = 0, accept_status = 0;
+	int maxlvt, timeout, num_starts, j;
+
+	/*
+	 * Be paranoid about clearing APIC errors.
+	 */
+	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+	}
+
+	Dprintk("Asserting INIT.\n");
+
+	/*
+	 * Turn INIT on target chip
+	 */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+	/*
+	 * Send IPI
+	 */
+	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
+				| APIC_DM_INIT);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	mdelay(10);
+
+	Dprintk("Deasserting INIT.\n");
+
+	/* Target chip */
+	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+	/* Send IPI */
+	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+
+	Dprintk("Waiting for send to finish...\n");
+	timeout = 0;
+	do {
+		Dprintk("+");
+		udelay(100);
+		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+	} while (send_status && (timeout++ < 1000));
+
+	atomic_set(&init_deasserted, 1);
+
+	/*
+	 * Should we send STARTUP IPIs ?
+	 *
+	 * Determine this based on the APIC version.
+	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
+	 */
+	if (APIC_INTEGRATED(apic_version[phys_apicid]))
+		num_starts = 2;
+	else
+		num_starts = 0;
+
+	/*
+	 * Run STARTUP IPI loop.
+	 */
+	Dprintk("#startup loops: %d.\n", num_starts);
+
+	maxlvt = get_maxlvt();
+
+	for (j = 1; j <= num_starts; j++) {
+		Dprintk("Sending STARTUP #%d.\n",j);
+		apic_read_around(APIC_SPIV);
+		apic_write(APIC_ESR, 0);
+		apic_read(APIC_ESR);
+		Dprintk("After apic_write.\n");
+
+		/*
+		 * STARTUP IPI
+		 */
+
+		/* Target chip */
+		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
+
+		/* Boot on the stack */
+		/* Kick the second */
+		apic_write_around(APIC_ICR, APIC_DM_STARTUP
+					| (start_eip >> 12));
+
+		/*
+		 * Give the other CPU some time to accept the IPI.
+		 */
+		udelay(300);
+
+		Dprintk("Startup point 1.\n");
+
+		Dprintk("Waiting for send to finish...\n");
+		timeout = 0;
+		do {
+			Dprintk("+");
+			udelay(100);
+			send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
+		} while (send_status && (timeout++ < 1000));
+
+		/*
+		 * Give the other CPU some time to accept the IPI.
+		 */
+		udelay(200);
+		/*
+		 * Due to the Pentium erratum 3AP.
+		 */
+		if (maxlvt > 3) {
+			apic_read_around(APIC_SPIV);
+			apic_write(APIC_ESR, 0);
+		}
+		accept_status = (apic_read(APIC_ESR) & 0xEF);
+		if (send_status || accept_status)
+			break;
+	}
+	Dprintk("After Startup.\n");
+
+	if (send_status)
+		printk("APIC never delivered???\n");
+	if (accept_status)
+		printk("APIC delivery error (%lx).\n", accept_status);
+
+	return (send_status | accept_status);
 }
+#endif	/* WAKE_SECONDARY_VIA_INIT */
 
-extern unsigned long cpu_initialized;
+extern cpumask_t cpu_initialized;
 
-static void __init do_boot_cpu (int apicid) 
+static int __init do_boot_cpu(int apicid)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
+ * Returns zero if CPU booted OK, else the wakeup_secondary_cpu() error or 1 if the CPU never called in.
  */
 {
-    struct domain *idle;
-    struct exec_domain *ed;
-    unsigned long boot_error = 0;
-    int timeout, cpu;
-    unsigned long start_eip;
-    void *stack;
-
-    cpu = ++cpucount;
+	struct domain *idle;
+	struct exec_domain *ed;
+	void *stack;
+	unsigned long boot_error;
+	int timeout, cpu;
+	unsigned long start_eip;
+	unsigned short nmi_high = 0, nmi_low = 0;
 
-    if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL )
-        panic("failed 'createdomain' for CPU %d", cpu);
+	cpu = ++cpucount;
 
-    ed = idle->exec_domain[0];
+	if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL )
+		panic("failed 'createdomain' for CPU %d", cpu);
 
-    set_bit(_DOMF_idle_domain, &idle->domain_flags);
+	ed = idle_task[cpu] = idle->exec_domain[0];
 
-    ed->arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
+	set_bit(_DOMF_idle_domain, &idle->domain_flags);
 
-    map_cpu_to_boot_apicid(cpu, apicid);
+	ed->arch.monitor_table = mk_pagetable(__pa(idle_pg_table));
 
-    idle_task[cpu] = ed;
+	/* start_eip had better be page-aligned! */
+	start_eip = setup_trampoline();
 
-    /* start_eip had better be page-aligned! */
-    start_eip = setup_trampoline();
+	/* So we see what's up   */
+	printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
 
-    /* So we see what's up. */
-    printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
-
-    stack = (void *)alloc_xenheap_pages(STACK_ORDER);
+	stack = (void *)alloc_xenheap_pages(STACK_ORDER);
 #if defined(__i386__)
-    stack_start.esp = __pa(stack);
+	stack_start.esp = (void *)__pa(stack);
 #elif defined(__x86_64__)
-    stack_start.esp = (unsigned long)stack;
+	stack_start.esp = stack;
 #endif
-    stack_start.esp += STACK_SIZE - sizeof(struct cpu_info);
-
-    /* Debug build: detect stack overflow by setting up a guard page. */
-    memguard_guard_stack(stack);
-
-    /*
-     * This grunge runs the startup process for
-     * the targeted processor.
-     */
-
-    atomic_set(&init_deasserted, 0);
-
-    Dprintk("Setting warm reset code and vector.\n");
-
-    CMOS_WRITE(0xa, 0xf);
-    local_flush_tlb();
-    Dprintk("1.\n");
-    *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
-    Dprintk("2.\n");
-    *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
-    Dprintk("3.\n");
-
-    /*
-     * Be paranoid about clearing APIC errors.
-     */
-    if ( APIC_INTEGRATED(apic_version[apicid]) )
-    {
-        apic_read_around(APIC_SPIV);
-        apic_write(APIC_ESR, 0);
-        apic_read(APIC_ESR);
-    }
-
-    /*
-     * Status is now clean
-     */
-    boot_error = 0;
-
-    /*
-     * Starting actual IPI sequence...
-     */
-
-    boot_error = wakeup_secondary_via_INIT(apicid, start_eip);
-
-    if (!boot_error) {
-        /*
-         * allow APs to start initializing.
-         */
-        Dprintk("Before Callout %d.\n", cpu);
-        set_bit(cpu, &cpu_callout_map);
-        Dprintk("After Callout %d.\n", cpu);
-
-        /*
-         * Wait 5s total for a response
-         */
-        for (timeout = 0; timeout < 50000; timeout++) {
-            if (test_bit(cpu, &cpu_callin_map))
-                break;	/* It has booted */
-            udelay(100);
-        }
-
-        if (test_bit(cpu, &cpu_callin_map)) {
-            /* number CPUs logically, starting from 1 (BSP is 0) */
-            printk("CPU%d has booted.\n", cpu);
-        } else {
-            boot_error= 1;
-            if (*((volatile unsigned int *)phys_to_virt(start_eip))
-                == 0xA5A5A5A5)
+	stack_start.esp += STACK_SIZE - sizeof(struct cpu_info);
+
+	/* Debug build: detect stack overflow by setting up a guard page. */
+	memguard_guard_stack(stack);
+
+	/*
+	 * This grunge runs the startup process for
+	 * the targeted processor.
+	 */
+
+	atomic_set(&init_deasserted, 0);
+
+	Dprintk("Setting warm reset code and vector.\n");
+
+	store_NMI_vector(&nmi_high, &nmi_low);
+
+	smpboot_setup_warm_reset_vector(start_eip);
+
+	/*
+	 * Starting actual IPI sequence...
+	 */
+	boot_error = wakeup_secondary_cpu(apicid, start_eip);
+
+	if (!boot_error) {
+		/*
+		 * allow APs to start initializing.
+		 */
+		Dprintk("Before Callout %d.\n", cpu);
+		cpu_set(cpu, cpu_callout_map);
+		Dprintk("After Callout %d.\n", cpu);
+
+		/*
+		 * Wait 5s total for a response
+		 */
+		for (timeout = 0; timeout < 50000; timeout++) {
+			if (cpu_isset(cpu, cpu_callin_map))
+				break;	/* It has booted */
+			udelay(100);
+		}
+
+		if (cpu_isset(cpu, cpu_callin_map)) {
+			/* number CPUs logically, starting from 1 (BSP is 0) */
+			Dprintk("OK.\n");
+			printk("CPU%d: ", cpu);
+			print_cpu_info(&cpu_data[cpu]);
+			Dprintk("CPU has booted.\n");
+		} else {
+			boot_error= 1;
+			if (*((volatile unsigned char *)trampoline_base)
+					== 0xA5)
 				/* trampoline started but...? */
-                printk("Stuck ??\n");
-            else
+				printk("Stuck ??\n");
+			else
 				/* trampoline code not run */
-                printk("Not responding.\n");
-#if APIC_DEBUG
-            inquire_remote_apic(apicid);
-#endif
-        }
-    }
-    if (boot_error) {
-        /* Try to put things back the way they were before ... */
-        unmap_cpu_to_boot_apicid(cpu, apicid);
-        clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */
-        clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
-        clear_bit(cpu, &cpu_online_map);  /* was set in smp_callin() */
-        cpucount--;
-    }
+				printk("Not responding.\n");
+			inquire_remote_apic(apicid);
+		}
+	}
+	x86_cpu_to_apicid[cpu] = apicid;
+	if (boot_error) {
+		/* Try to put things back the way they were before ... */
+		unmap_cpu_to_logical_apicid(cpu);
+		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+		cpucount--;
+	}
+
+	/* mark "stuck" area as not stuck */
+	*((volatile unsigned long *)trampoline_base) = 0;
+
+	return boot_error;
 }
 
+#if 0
+cycles_t cacheflush_time;
+unsigned long cache_decay_ticks;
+
+static void smp_tune_scheduling (void)
+{
+	unsigned long cachesize;       /* kB   */
+	unsigned long bandwidth = 350; /* MB/s */
+	/*
+	 * Rough estimation for SMP scheduling, this is the number of
+	 * cycles it takes for a fully memory-limited process to flush
+	 * the SMP-local cache.
+	 *
+	 * (For a P5 this pretty much means we will choose another idle
+	 *  CPU almost always at wakeup time (this is due to the small
+	 *  L1 cache), on PIIs it's around 50-100 usecs, depending on
+	 *  the cache size)
+	 */
+
+	if (!cpu_khz) {
+		/*
+		 * this basically disables processor-affinity
+		 * scheduling on SMP without a TSC.
+		 */
+		cacheflush_time = 0;
+		return;
+	} else {
+		cachesize = boot_cpu_data.x86_cache_size;
+		if (cachesize == -1) {
+			cachesize = 16; /* Pentiums, 2x8kB cache */
+			bandwidth = 100;
+		}
+
+		cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth;
+	}
+
+	cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1;
+
+	printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n",
+		(long)cacheflush_time/(cpu_khz/1000),
+		((long)cacheflush_time*100/(cpu_khz/1000)) % 100);
+	printk("task migration cache decay timeout: %ld msecs.\n",
+		cache_decay_ticks);
+}
+#else
+#define smp_tune_scheduling() ((void)0)
+#endif
 
 /*
  * Cycle through the processors sending APIC IPIs to boot each.
@@ -776,178 +913,274 @@ static void __init do_boot_cpu (int apicid)
 
 static int boot_cpu_logical_apicid;
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio = NULL;
+void *xquad_portio;
+
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 
-void __init smp_boot_cpus(void)
+static void __init smp_boot_cpus(unsigned int max_cpus)
 {
-    int apicid, bit;
-
-    /* Initialize the logical to physical CPU number mapping */
-    init_cpu_to_apicid();
-
-    /*
-     * Setup boot CPU information
-     */
-    smp_store_cpu_info(0); /* Final full version of the data */
-    printk("CPU%d booted\n", 0);
-
-    /*
-     * We have the boot CPU online for sure.
-     */
-    set_bit(0, &cpu_online_map);
-    boot_cpu_logical_apicid = logical_smp_processor_id();
-    map_cpu_to_boot_apicid(0, boot_cpu_apicid);
-
-    /*
-     * If we couldnt find an SMP configuration at boot time,
-     * get out of here now!
-     */
-    if (!smp_found_config || opt_nosmp) {
-        io_apic_irqs = 0;
-        phys_cpu_present_map = physid_mask_of_physid(0);
-        cpu_online_map = 1;
-        smp_num_cpus = 1;
-        if (APIC_init_uniprocessor())
-            printk("Local APIC not detected."
-                   " Using dummy APIC emulation.\n");
-        goto smp_done;
-    }
-
-    /*
-     * Should not be necessary because the MP table should list the boot
-     * CPU too, but we do it for the sake of robustness anyway.
-     */
-    if (!test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map)) {
-        printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
-               boot_cpu_physical_apicid);
-        physid_set(hard_smp_processor_id(), phys_cpu_present_map);
-    }
-
-    /*
-     * If we couldn't find a local APIC, then get out of here now!
-     */
-    if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
-        !test_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability)) {
-        printk("BIOS bug, local APIC #%d not detected!...\n",
-               boot_cpu_physical_apicid);
-        printk("... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
-        io_apic_irqs = 0;
-        phys_cpu_present_map = physid_mask_of_physid(0);
-        cpu_online_map = 1;
-        smp_num_cpus = 1;
-        goto smp_done;
-    }
-
-    verify_local_APIC();
-
-    /*
-     * If SMP should be disabled, then really disable it!
-     */
-    if (!max_cpus) {
-        smp_found_config = 0;
-        printk("SMP mode deactivated, forcing use of dummy APIC emulation.\n");
-        io_apic_irqs = 0;
-        phys_cpu_present_map = physid_mask_of_physid(0);
-        cpu_online_map = 1;
-        smp_num_cpus = 1;
-        goto smp_done;
-    }
-
-    connect_bsp_APIC();
-    setup_local_APIC();
-
-    if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid)
-        BUG();
-
-    /*
-     * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
-     *
-     * In clustered apic mode, phys_cpu_present_map is a constructed thus:
-     * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the 
-     * clustered apic ID.
-     */
-    Dprintk("CPU present map: %lx\n", phys_cpu_present_map);
-
-    for (bit = 0; bit < NR_CPUS; bit++) {
-        apicid = cpu_present_to_apicid(bit);
-        /*
-         * Don't even attempt to start the boot CPU!
-         */
-        if (apicid == boot_cpu_apicid)
-            continue;
-
-        /* 
-         * Don't start hyperthreads if option noht requested.
-         */
-        if (opt_noht && (apicid & (ht_per_core - 1)))
-            continue;
-
-        if (!check_apicid_present(bit))
-            continue;
-        if ((max_cpus >= 0) && (max_cpus <= cpucount+1))
-            continue;
-
-        do_boot_cpu(apicid);
-
-        /*
-         * Make sure we unmap all failed CPUs
-         */
-        if ((boot_apicid_to_cpu(apicid) == -1) &&
-            (!check_apicid_present(bit)))
-            printk("CPU #%d not responding - cannot use it.\n",
-                   apicid);
-    }
-
-    /*
-     * Cleanup possible dangling ends...
-     */
-    /*
-     * Install writable page 0 entry to set BIOS data area.
-     */
-    local_flush_tlb();
-
-    /*
-     * Paranoid:  Set warm reset code and vector here back
-     * to default values.
-     */
-    CMOS_WRITE(0, 0xf);
-
-    *((volatile long *) phys_to_virt(0x467)) = 0;
-
-    if (!cpucount) {
-        printk("Error: only one processor found.\n");
-    } else {
-        printk("Total of %d processors activated.\n", cpucount+1);
-    }
-    smp_num_cpus = cpucount + 1;
-
-    Dprintk("Boot done.\n");
-
-    /*
-     * Here we can be sure that there is an IO-APIC in the system. Let's
-     * go and set it up:
-     */
-    if ( nr_ioapics ) setup_IO_APIC();
-
-    /* Set up all local APIC timers in the system. */
-    {
-        extern void setup_APIC_clocks(void);
-        setup_APIC_clocks();
-    }
-
-    /* Synchronize the TSC with the AP(s). */
-    if ( cpucount ) synchronize_tsc_bp();
-
- smp_done:
-    ;
+	int apicid, cpu, bit, kicked;
+#ifdef BOGOMIPS
+	unsigned long bogosum = 0;
+#endif
+
+	/*
+	 * Setup boot CPU information
+	 */
+	smp_store_cpu_info(0); /* Final full version of the data */
+	printk("CPU%d: ", 0);
+	print_cpu_info(&cpu_data[0]);
+
+	boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+	boot_cpu_logical_apicid = logical_smp_processor_id();
+	x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+
+	/*current_thread_info()->cpu = 0;*/
+	smp_tune_scheduling();
+	cpus_clear(cpu_sibling_map[0]);
+	cpu_set(0, cpu_sibling_map[0]);
+
+	/*
+	 * If we couldn't find an SMP configuration at boot time,
+	 * get out of here now!
+	 */
+	if (!smp_found_config && !acpi_lapic) {
+		printk(KERN_NOTICE "SMP motherboard not detected.\n");
+		smpboot_clear_io_apic_irqs();
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		if (APIC_init_uniprocessor())
+			printk(KERN_NOTICE "Local APIC not detected."
+					   " Using dummy APIC emulation.\n");
+		map_cpu_to_logical_apicid();
+		return;
+	}
+
+	/*
+	 * Should not be necessary because the MP table should list the boot
+	 * CPU too, but we do it for the sake of robustness anyway.
+	 * Makes no sense to do this check in clustered apic mode, so skip it
+	 */
+	if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
+		printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
+				boot_cpu_physical_apicid);
+		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+	}
+
+	/*
+	 * If we couldn't find a local APIC, then get out of here now!
+	 */
+	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
+		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+			boot_cpu_physical_apicid);
+		printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
+		smpboot_clear_io_apic_irqs();
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		return;
+	}
+
+	verify_local_APIC();
+
+	/*
+	 * If SMP should be disabled, then really disable it!
+	 */
+	if (!max_cpus) {
+		smp_found_config = 0;
+		printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
+		smpboot_clear_io_apic_irqs();
+		phys_cpu_present_map = physid_mask_of_physid(0);
+		return;
+	}
+
+	connect_bsp_APIC();
+	setup_local_APIC();
+	map_cpu_to_logical_apicid();
+
+
+	setup_portio_remap();
+
+	/*
+	 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
+	 *
+	 * In clustered apic mode, phys_cpu_present_map is constructed thus:
+	 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the 
+	 * clustered apic ID.
+	 */
+	Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+
+	kicked = 1;
+	for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
+		apicid = cpu_present_to_apicid(bit);
+		/*
+		 * Don't even attempt to start the boot CPU!
+		 */
+		if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
+			continue;
+
+		if (!check_apicid_present(bit))
+			continue;
+		if (max_cpus <= cpucount+1)
+			continue;
+
+		if (do_boot_cpu(apicid))
+			printk("CPU #%d not responding - cannot use it.\n",
+								apicid);
+		else
+			++kicked;
+	}
+
+	/*
+	 * Cleanup possible dangling ends...
+	 */
+	smpboot_restore_warm_reset_vector();
+
+#ifdef BOGOMIPS
+	/*
+	 * Allow the user to impress friends.
+	 */
+	Dprintk("Before bogomips.\n");
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		if (cpu_isset(cpu, cpu_callout_map))
+			bogosum += cpu_data[cpu].loops_per_jiffy;
+	printk(KERN_INFO
+		"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+		cpucount+1,
+		bogosum/(500000/HZ),
+		(bogosum/(5000/HZ))%100);
+#else
+	printk("Total of %d processors activated.\n", cpucount+1);
+#endif
+	
+	Dprintk("Before bogocount - setting activated=1.\n");
+
+	if (smp_b_stepping)
+		printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
+
+	/*
+	 * Don't taint if we are running SMP kernel on a single non-MP
+	 * approved Athlon
+	 */
+	if (tainted & TAINT_UNSAFE_SMP) {
+		if (cpucount)
+			printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
+		else
+			tainted &= ~TAINT_UNSAFE_SMP;
+	}
+
+	Dprintk("Boot done.\n");
+
+	/*
+	 * construct cpu_sibling_map[], so that we can tell sibling CPUs
+	 * efficiently.
+	 */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpus_clear(cpu_sibling_map[cpu]);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		int siblings = 0;
+		int i;
+		if (!cpu_isset(cpu, cpu_callout_map))
+			continue;
+
+		if (smp_num_siblings > 1) {
+			for (i = 0; i < NR_CPUS; i++) {
+				if (!cpu_isset(i, cpu_callout_map))
+					continue;
+				if (phys_proc_id[cpu] == phys_proc_id[i]) {
+					siblings++;
+					cpu_set(i, cpu_sibling_map[cpu]);
+				}
+			}
+		} else {
+			siblings++;
+			cpu_set(cpu, cpu_sibling_map[cpu]);
+		}
+
+		if (siblings != smp_num_siblings)
+			printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		check_nmi_watchdog();
+
+	smpboot_setup_io_apic();
+
+	setup_boot_APIC_clock();
+
+	/*
+	 * Synchronize the TSC with the AP
+	 */
+	if (cpu_has_tsc && cpucount && cpu_khz)
+		synchronize_tsc_bp();
 }
 
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: nil
- * End:
- */
+/* These are wrappers to interface to the new boot process.  Someone
+   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
+void __init smp_prepare_cpus(unsigned int max_cpus)
+{
+	smp_boot_cpus(max_cpus);
+}
+
+void __devinit smp_prepare_boot_cpu(void)
+{
+	cpu_set(smp_processor_id(), cpu_online_map);
+	cpu_set(smp_processor_id(), cpu_callout_map);
+}
+
+int __devinit __cpu_up(unsigned int cpu)
+{
+	/* This only works at boot for x86.  See "rewrite" above. */
+	if (cpu_isset(cpu, smp_commenced_mask)) {
+		local_irq_enable();
+		return -ENOSYS;
+	}
+
+	/* In case one didn't come up */
+	if (!cpu_isset(cpu, cpu_callin_map)) {
+		local_irq_enable();
+		return -EIO;
+	}
+
+	local_irq_enable();
+	/* Unleash the CPU! */
+	cpu_set(cpu, smp_commenced_mask);
+	while (!cpu_isset(cpu, cpu_online_map))
+		mb();
+	return 0;
+}
+
+void __init smp_cpus_done(unsigned int max_cpus)
+{
+#ifdef CONFIG_X86_IO_APIC
+	setup_ioapic_dest();
+#endif
+#ifdef CONFIG_X86_64
+	zap_low_mappings();
+#endif
+	/*
+	 * Disable executability of the SMP trampoline:
+	 */
+	set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+}
+
+#if 0
+void __init smp_intr_init(void)
+{
+	/*
+	 * IRQ0 must be given a fixed assignment and initialized,
+	 * because it's used before the IO-APIC is set up.
+	 */
+	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPI for invalidation */
+	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+	/* IPI for generic function call */
+	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+}
+#endif
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index d9a6a5999f..3e3b770ae4 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -37,7 +37,6 @@ unsigned long cpu_khz;  /* Detected as we calibrate the TSC */
 unsigned long ticks_per_usec; /* TSC ticks per microsecond. */
 spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
 int timer_ack = 0;
-int do_timer_lists_from_pit = 0;
 unsigned long volatile jiffies;
 
 /* PRIVATE */
@@ -91,7 +90,7 @@ void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
     write_unlock_irq(&time_lock);
 
     /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
-    if ( do_timer_lists_from_pit )
+    if ( !cpu_has_apic )
         raise_softirq(AC_TIMER_SOFTIRQ);
 }
 
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index fc2ee40d7b..7907fe269d 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -99,6 +99,7 @@ integer_param("debug_stack_lines", debug_stack_lines);
 
 static inline int kernel_text_address(unsigned long addr)
 {
+    extern char _stext, _etext;
     if (addr >= (unsigned long) &_stext &&
         addr <= (unsigned long) &_etext)
         return 1;
diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c
index f3a3b541ef..7c814c8ec9 100644
--- a/xen/arch/x86/vmx.c
+++ b/xen/arch/x86/vmx.c
@@ -22,10 +22,10 @@
 #include <xen/lib.h>
 #include <xen/trace.h>
 #include <xen/sched.h>
+#include <xen/irq.h>
 #include <xen/softirq.h>
 #include <asm/current.h>
 #include <asm/io.h>
-#include <asm/irq.h>
 #include <asm/shadow.h>
 #include <asm/regs.h>
 #include <asm/cpufeature.h>
@@ -49,7 +49,7 @@ extern long evtchn_send(int lport);
 extern long do_block(void);
 void do_nmi(struct cpu_user_regs *, unsigned long);
 
-int start_vmx()
+int start_vmx(void)
 {
     struct vmcs_struct *vmcs;
     u32 ecx;
@@ -70,12 +70,14 @@ int start_vmx()
     if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) {
         if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) {
                 printk("VMX disabled by Feature Control MSR.\n");
-		return 0;
+                return 0;
         }
     }
-    else 
+    else {
         wrmsr(IA32_FEATURE_CONTROL_MSR, 
-              IA32_FEATURE_CONTROL_MSR_LOCK | IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
+              IA32_FEATURE_CONTROL_MSR_LOCK |
+              IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0);
+    }
 
     set_in_cr4(X86_CR4_VMXE);   /* Enable VMXE */
 
@@ -93,7 +95,7 @@ int start_vmx()
     return 1;
 }
 
-void stop_vmx()
+void stop_vmx(void)
 {
     if (read_cr4() & X86_CR4_VMXE)
         __vmxoff();
@@ -167,7 +169,7 @@ static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs)
     return result;
 }
 
-static void vmx_do_no_device_fault() 
+static void vmx_do_no_device_fault(void)
 {
     unsigned long cr0;
         
diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c
index 4ffd4061fb..f46856fb2b 100644
--- a/xen/common/ac_timer.c
+++ b/xen/common/ac_timer.c
@@ -125,7 +125,7 @@ static int add_entry(struct ac_timer **heap, struct ac_timer *t)
         struct ac_timer **new_heap = xmalloc_array(struct ac_timer *, limit);
         if ( new_heap == NULL ) BUG();
         memcpy(new_heap, heap, (limit>>1)*sizeof(struct ac_timer *));
-        for ( i = 0; i < smp_num_cpus; i++ )
+        for ( i = 0; i < NR_CPUS; i++ )
             if ( ac_timers[i].heap == heap )
                 ac_timers[i].heap = new_heap;
         xfree(heap);
@@ -248,7 +248,7 @@ static void dump_timerq(unsigned char key)
     printk("Dumping ac_timer queues: NOW=0x%08X%08X\n",
            (u32)(now>>32), (u32)now); 
 
-    for ( i = 0; i < smp_num_cpus; i++ )
+    for_each_online_cpu( i )
     {
         printk("CPU[%02d] ", i);
         spin_lock_irqsave(&ac_timers[i].lock, flags);
@@ -270,7 +270,7 @@ void __init ac_timer_init(void)
 
     open_softirq(AC_TIMER_SOFTIRQ, ac_timer_softirq_action);
 
-    for ( i = 0; i < smp_num_cpus; i++ )
+    for ( i = 0; i < NR_CPUS; i++ )
     {
         ac_timers[i].heap = xmalloc_array(
             struct ac_timer *, DEFAULT_HEAP_LIMIT+1);
diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c
index df92bea133..20cef35e29 100644
--- a/xen/common/dom0_ops.c
+++ b/xen/common/dom0_ops.c
@@ -155,7 +155,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
         unsigned int        pro;
         domid_t             dom;
         struct exec_domain *ed;
-        unsigned int        i, ht, cnt[NR_CPUS] = { 0 };
+        unsigned int        i, cnt[NR_CPUS] = { 0 };
 
 
         dom = op->u.createdomain.domain;
@@ -182,9 +182,8 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
          * domains will all share the second HT of each CPU. Since dom0 is on 
 	     * CPU 0, we favour high numbered CPUs in the event of a tie.
          */
-        ht = opt_noht ? 1 : ht_per_core;
-        pro = ht-1;
-        for ( i = pro; i < smp_num_cpus; i += ht )
+        pro = ht_per_core - 1;
+        for ( i = pro; i < num_online_cpus(); i += ht_per_core )
             if ( cnt[i] <= cnt[pro] )
                 pro = i;
 
@@ -269,7 +268,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
         else
         {
             /* pick a new cpu from the usable map */
-            int new_cpu = (int)find_first_set_bit(cpumap) % smp_num_cpus;
+            int new_cpu = (int)find_first_set_bit(cpumap) % num_online_cpus();
 
             exec_domain_pause(ed);
             if ( ed->processor != new_cpu )
diff --git a/xen/common/domain.c b/xen/common/domain.c
index b7f104353c..835154051b 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -50,7 +50,10 @@ struct domain *do_createdomain(domid_t dom_id, unsigned int cpu)
     INIT_LIST_HEAD(&d->page_list);
     INIT_LIST_HEAD(&d->xenpage_list);
 
-    if ( (d->domain_id != IDLE_DOMAIN_ID) &&
+    if ( d->domain_id == IDLE_DOMAIN_ID )
+        set_bit(_DOMF_idle_domain, &d->domain_flags);
+
+    if ( !is_idle_task(d) &&
          ((init_event_channels(d) != 0) || (grant_table_create(d) != 0)) )
     {
         destroy_event_channels(d);
@@ -62,7 +65,7 @@ struct domain *do_createdomain(domid_t dom_id, unsigned int cpu)
     
     sched_add_domain(ed);
 
-    if ( d->domain_id != IDLE_DOMAIN_ID )
+    if ( !is_idle_task(d) )
     {
         write_lock(&domlist_lock);
         pd = &domain_list; /* NB. domain_list maintained in order of dom_id. */
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 72b25bd0ea..5b388cafbf 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -45,8 +45,8 @@ string_param("badpage", opt_badpage);
 #define round_pgdown(_p)  ((_p)&PAGE_MASK)
 #define round_pgup(_p)    (((_p)+(PAGE_SIZE-1))&PAGE_MASK)
 
-static spinlock_t page_scrub_lock;
-struct list_head page_scrub_list;
+static spinlock_t page_scrub_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(page_scrub_list);
 
 /*********************
  * ALLOCATION BITMAP
@@ -675,8 +675,6 @@ static void page_scrub_softirq(void)
 
 static __init int page_scrub_init(void)
 {
-    spin_lock_init(&page_scrub_lock);
-    INIT_LIST_HEAD(&page_scrub_list);
     open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq);
     return 0;
 }
diff --git a/xen/common/perfc.c b/xen/common/perfc.c
index 157d49ffc8..7363fb98c7 100644
--- a/xen/common/perfc.c
+++ b/xen/common/perfc.c
@@ -55,10 +55,11 @@ void perfc_printall(unsigned char key)
             break;
         case TYPE_CPU:
         case TYPE_S_CPU:
-            for ( j = sum = 0; j < smp_num_cpus; j++ )
+            sum = 0;
+            for_each_online_cpu ( j )
                 sum += atomic_read(&counters[j]);
             printk("TOTAL[%10d]  ", sum);
-            for ( j = 0; j < smp_num_cpus; j++ )
+            for_each_online_cpu ( j )
                 printk("CPU%02d[%10d]  ", j, atomic_read(&counters[j]));
             counters += NR_CPUS;
             break;
@@ -84,7 +85,7 @@ void perfc_printall(unsigned char key)
 
 void perfc_reset(unsigned char key)
 {
-    int i, j, sum;
+    int i, j;
     s_time_t now = NOW();
     atomic_t *counters = (atomic_t *)&perfcounters;
 
@@ -104,13 +105,13 @@ void perfc_reset(unsigned char key)
             counters += 1;
             break;
         case TYPE_CPU:
-            for ( j = sum = 0; j < smp_num_cpus; j++ )
+            for ( j = 0; j < NR_CPUS; j++ )
                 atomic_set(&counters[j],0);
         case TYPE_S_CPU:
             counters += NR_CPUS;
             break;
         case TYPE_ARRAY:
-            for ( j = sum = 0; j < perfc_info[i].nr_elements; j++ )
+            for ( j = 0; j < perfc_info[i].nr_elements; j++ )
                 atomic_set(&counters[j],0);
         case TYPE_S_ARRAY:
             counters += perfc_info[i].nr_elements;
@@ -146,7 +147,7 @@ static int perfc_copy_info(dom0_perfc_desc_t *desc)
                 break;
             case TYPE_CPU:
             case TYPE_S_CPU:
-                perfc_d[i].nr_vals = smp_num_cpus;
+                perfc_d[i].nr_vals = num_online_cpus();
                 break;
             case TYPE_ARRAY:
             case TYPE_S_ARRAY:
diff --git a/xen/common/sched_bvt.c b/xen/common/sched_bvt.c
index 1ad20578f4..227804ebaf 100644
--- a/xen/common/sched_bvt.c
+++ b/xen/common/sched_bvt.c
@@ -169,14 +169,19 @@ static inline u32 calc_evt(struct exec_domain *d, u32 avt)
 static int bvt_alloc_task(struct exec_domain *ed)
 {
     struct domain *d = ed->domain;
-    if ( (d->sched_priv == NULL) ) {
+
+    if ( (d->sched_priv == NULL) )
+    {
         if ( (d->sched_priv = xmalloc(struct bvt_dom_info)) == NULL )
             return -1;
         memset(d->sched_priv, 0, sizeof(struct bvt_dom_info));
     }
+
     ed->sched_priv = &BVT_INFO(d)->ed_inf[ed->vcpu_id];
+
     BVT_INFO(d)->ed_inf[ed->vcpu_id].inf = BVT_INFO(d);
     BVT_INFO(d)->ed_inf[ed->vcpu_id].exec_domain = ed;
+
     return 0;
 }
 
@@ -190,6 +195,15 @@ static void bvt_add_task(struct exec_domain *d)
     ASSERT(inf != NULL);
     ASSERT(d   != NULL);
 
+    /* Allocate per-CPU context if this is the first domain to be added. */
+    if ( CPU_INFO(d->processor) == NULL )
+    {
+        schedule_data[d->processor].sched_priv = xmalloc(struct bvt_cpu_info);
+        BUG_ON(CPU_INFO(d->processor) == NULL);
+        INIT_LIST_HEAD(RUNQUEUE(d->processor));
+        CPU_SVT(d->processor) = 0;
+    }
+
     if ( d->vcpu_id == 0 )
     {
         inf->mcu_advance = MCU_ADVANCE;
@@ -213,9 +227,11 @@ static void bvt_add_task(struct exec_domain *d)
 
     einf->exec_domain = d;
 
-    if ( d->domain->domain_id == IDLE_DOMAIN_ID )
+    if ( is_idle_task(d->domain) )
     {
         einf->avt = einf->evt = ~0U;
+        BUG_ON(__task_on_runqueue(d));
+        __add_to_runqueue_head(d);
     } 
     else 
     {
@@ -225,20 +241,6 @@ static void bvt_add_task(struct exec_domain *d)
     }
 }
 
-static int bvt_init_idle_task(struct exec_domain *ed)
-{
-    if ( bvt_alloc_task(ed) < 0 )
-        return -1;
-
-    bvt_add_task(ed);
-
-    set_bit(_VCPUF_running, &ed->vcpu_flags);
-    if ( !__task_on_runqueue(ed) )
-        __add_to_runqueue_head(ed);
-
-    return 0;
-}
-
 static void bvt_wake(struct exec_domain *ed)
 {
     struct bvt_edom_info *einf = EBVT_INFO(ed);
@@ -548,36 +550,11 @@ static void bvt_dump_cpu_state(int i)
     }
 }
 
-/* Initialise the data structures. */
-static int bvt_init_scheduler(void)
-{
-    int i;
-
-    for ( i = 0; i < NR_CPUS; i++ )
-    {
-        schedule_data[i].sched_priv = xmalloc(struct bvt_cpu_info);
-       
-        if ( schedule_data[i].sched_priv == NULL )
-        {
-            printk("Failed to allocate BVT scheduler per-CPU memory!\n");
-            return -1;
-        }
-
-        INIT_LIST_HEAD(RUNQUEUE(i));
-        
-        CPU_SVT(i) = 0; /* XXX do I really need to do this? */
-    }
-
-    return 0;
-}
-
 struct scheduler sched_bvt_def = {
     .name     = "Borrowed Virtual Time",
     .opt_name = "bvt",
     .sched_id = SCHED_BVT,
     
-    .init_scheduler = bvt_init_scheduler,
-    .init_idle_task = bvt_init_idle_task,
     .alloc_task     = bvt_alloc_task,
     .add_task       = bvt_add_task,
     .free_task      = bvt_free_task,
diff --git a/xen/common/sched_sedf.c b/xen/common/sched_sedf.c
index d4ed67ed5b..3ea2db1522 100644
--- a/xen/common/sched_sedf.c
+++ b/xen/common/sched_sedf.c
@@ -13,20 +13,18 @@
 #include <xen/time.h>
 #include <xen/slab.h>
 
-/*#include <xen/adv_sched_hist.h>*/
-
 /*verbosity settings*/
 #define SEDFLEVEL 0
 #define PRINT(_f, _a...)  \
-if ((_f)<=SEDFLEVEL) printk(_a );
+    if ((_f)<=SEDFLEVEL) printk(_a );
 
 #ifndef NDEBUG
-	#define SEDF_STATS
-	#define CHECK(_p) if ( !(_p) ) \
-	{ printk("Check '%s' failed, line %d, file %s\n", #_p , __LINE__,\
-	__FILE__);}
+#define SEDF_STATS
+#define CHECK(_p) if ( !(_p) ) \
+ { printk("Check '%s' failed, line %d, file %s\n", #_p , __LINE__,\
+ __FILE__);}
 #else
-	#define CHECK(_p) ((void)0)
+#define CHECK(_p) ((void)0)
 #endif
 
 /*various ways of unblocking domains*/
@@ -64,72 +62,72 @@ if ((_f)<=SEDFLEVEL) printk(_a );
 
 
 struct sedf_dom_info {
-	struct domain		*domain;
+    struct domain  *domain;
 };
 struct sedf_edom_info
 {
-	struct exec_domain	*exec_domain;
-	struct list_head	list;
-	struct list_head	extralist[2];
-	
-	/*Parameters for EDF*/
-	s_time_t		period;		/*=(relative deadline)*/
-	s_time_t		slice;		/*=worst case execution time*/
-	
-	/*Advaced Parameters*/
-	/*Latency Scaling*/
-	s_time_t		period_orig;	
-	s_time_t		slice_orig;
-	s_time_t		latency;
-	
-	/*status of domain*/
-	int			status;
-	/*weights for "Scheduling for beginners/ lazy/ etc." ;)*/
-	short			weight;
-        short                   extraweight;
-        /*Bookkeeping*/
-	s_time_t		deadl_abs;
-	s_time_t		sched_start_abs;
-	s_time_t		cputime;
-	/* times the domain un-/blocked */
-	s_time_t		block_abs;
-	s_time_t		unblock_abs;
-	
-	/*scores for {util, block penalty}-weighted extratime distribution*/
-	int			score[2];	
-	s_time_t		short_block_lost_tot;
-	
-	/*Statistics*/
-	s_time_t		extra_time_tot;
+    struct exec_domain *exec_domain;
+    struct list_head list;
+    struct list_head extralist[2];
+ 
+    /*Parameters for EDF*/
+    s_time_t  period;  /*=(relative deadline)*/
+    s_time_t  slice;  /*=worst case execution time*/
+ 
+    /*Advanced Parameters*/
+    /*Latency Scaling*/
+    s_time_t  period_orig; 
+    s_time_t  slice_orig;
+    s_time_t  latency;
+ 
+    /*status of domain*/
+    int   status;
+    /*weights for "Scheduling for beginners/ lazy/ etc." ;)*/
+    short   weight;
+    short                   extraweight;
+    /*Bookkeeping*/
+    s_time_t  deadl_abs;
+    s_time_t  sched_start_abs;
+    s_time_t  cputime;
+    /* times the domain un-/blocked */
+    s_time_t  block_abs;
+    s_time_t  unblock_abs;
+ 
+    /*scores for {util, block penalty}-weighted extratime distribution*/
+    int   score[2]; 
+    s_time_t  short_block_lost_tot;
+ 
+    /*Statistics*/
+    s_time_t  extra_time_tot;
 
 #ifdef SEDF_STATS
-	s_time_t		block_time_tot;
-	s_time_t		penalty_time_tot;
-	int			block_tot;
-	int			short_block_tot;
-	int			long_block_tot;
-	int			short_cont;
-	int			pen_extra_blocks;
-	int			pen_extra_slices;
+    s_time_t  block_time_tot;
+    s_time_t  penalty_time_tot;
+    int   block_tot;
+    int   short_block_tot;
+    int   long_block_tot;
+    int   short_cont;
+    int   pen_extra_blocks;
+    int   pen_extra_slices;
 #endif
 };
 
 struct sedf_cpu_info {
-	struct list_head runnableq;
-	struct list_head waitq;
-	struct list_head extraq[2];
+    struct list_head runnableq;
+    struct list_head waitq;
+    struct list_head extraq[2];
 };
 
-#define EDOM_INFO(d)		((struct sedf_edom_info *)((d)->sched_priv))
-#define CPU_INFO(cpu)	((struct sedf_cpu_info *)schedule_data[cpu].sched_priv)
-#define LIST(d)			(&EDOM_INFO(d)->list)
-#define EXTRALIST(d,i)		(&(EDOM_INFO(d)->extralist[i]))
-#define RUNQ(cpu)   		(&CPU_INFO(cpu)->runnableq)
-#define WAITQ(cpu)   		(&CPU_INFO(cpu)->waitq)
-#define EXTRAQ(cpu,i)  		(&(CPU_INFO(cpu)->extraq[i]))
-#define IDLETASK(cpu)		((struct exec_domain *)schedule_data[cpu].idle)
+#define EDOM_INFO(d)  ((struct sedf_edom_info *)((d)->sched_priv))
+#define CPU_INFO(cpu) ((struct sedf_cpu_info *)schedule_data[cpu].sched_priv)
+#define LIST(d)   (&EDOM_INFO(d)->list)
+#define EXTRALIST(d,i)  (&(EDOM_INFO(d)->extralist[i]))
+#define RUNQ(cpu)     (&CPU_INFO(cpu)->runnableq)
+#define WAITQ(cpu)     (&CPU_INFO(cpu)->waitq)
+#define EXTRAQ(cpu,i)    (&(CPU_INFO(cpu)->extraq[i]))
+#define IDLETASK(cpu)  ((struct exec_domain *)schedule_data[cpu].idle)
 
-#define PERIOD_BEGIN(inf)	((inf)->deadl_abs - (inf)->period)
+#define PERIOD_BEGIN(inf) ((inf)->deadl_abs - (inf)->period)
 
 #define MIN(x,y) (((x)<(y))?(x):(y))
 #define DIV_UP(x,y) (((x) + (y) - 1) / y)
@@ -142,8 +140,8 @@ struct sedf_cpu_info {
 static void sedf_dump_cpu_state(int i);
 
 static inline int extraq_on(struct exec_domain *d, int i) {
-	return ((EXTRALIST(d,i)->next != NULL) &&
-		(EXTRALIST(d,i)->next != EXTRALIST(d,i)));
+    return ((EXTRALIST(d,i)->next != NULL) &&
+            (EXTRALIST(d,i)->next != EXTRALIST(d,i)));
 }
 
 static inline void extraq_add_head(struct exec_domain *d, int i)
@@ -160,13 +158,13 @@ static inline void extraq_add_tail(struct exec_domain *d, int i)
 
 static inline void extraq_del(struct exec_domain *d, int i)
 {
-	struct list_head *list = EXTRALIST(d,i);
-	ASSERT(extraq_on(d,i));
-	PRINT(3, "Removing domain %i.%i from L%i extraq\n", d->domain->domain_id,
-	   d->vcpu_id, i);	
-	list_del(list);
-	list->next = NULL;
-	ASSERT(!extraq_on(d, i));
+    struct list_head *list = EXTRALIST(d,i);
+    ASSERT(extraq_on(d,i));
+    PRINT(3, "Removing domain %i.%i from L%i extraq\n", d->domain->domain_id,
+          d->vcpu_id, i); 
+    list_del(list);
+    list->next = NULL;
+    ASSERT(!extraq_on(d, i));
 }
 
 /* adds a domain to the queue of processes which are aware of extra time. List
@@ -176,92 +174,92 @@ static inline void extraq_del(struct exec_domain *d, int i)
    charging each domain that recieved extratime with an inverse of its weight.
  */ 
 static inline void extraq_add_sort_update(struct exec_domain *d, int i, int sub) {
-	struct list_head      *cur;
-	struct sedf_edom_info *curinf;
-	
-	ASSERT(!extraq_on(d,i));
-	PRINT(3, "Adding domain %i.%i (score= %i, short_pen= %"PRIi64")"
-              " to L%i extraq\n",
-              d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->score[i],
-              EDOM_INFO(d)->short_block_lost_tot, i);	
-	/*iterate through all elements to find our "hole" and on our way
-	  update all the other scores*/
-	list_for_each(cur,EXTRAQ(d->processor,i)){
-		curinf = list_entry(cur,struct sedf_edom_info,extralist[i]);
-		curinf->score[i] -= sub;
-		if (EDOM_INFO(d)->score[i] < curinf->score[i])
-	 		break;
-		else
-			PRINT(4,"\tbehind domain %i.%i (score= %i)\n",
-			      curinf->exec_domain->domain->domain_id,
-			      curinf->exec_domain->vcpu_id, curinf->score[i]);
-	}
-	/*cur now contains the element, before which we'll enqueue*/
-	PRINT(3, "\tlist_add to %p\n", cur->prev);
-	list_add(EXTRALIST(d,i),cur->prev);
-	
-	/*continue updating the extraq*/
-	if ((cur != EXTRAQ(d->processor,i)) && sub)
-		for (cur = cur->next; cur != EXTRAQ(d->processor,i);
-		     cur = cur-> next) {
-			curinf = list_entry(cur,struct sedf_edom_info,
-				extralist[i]);
-			curinf->score[i] -= sub;
-			PRINT(4, "\tupdating domain %i.%i (score= %u)\n",
-			      curinf->exec_domain->domain->domain_id, 
-			      curinf->exec_domain->vcpu_id, curinf->score[i]);
-		}
-	ASSERT(extraq_on(d,i));
+    struct list_head      *cur;
+    struct sedf_edom_info *curinf;
+ 
+    ASSERT(!extraq_on(d,i));
+    PRINT(3, "Adding domain %i.%i (score= %i, short_pen= %"PRIi64")"
+          " to L%i extraq\n",
+          d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->score[i],
+          EDOM_INFO(d)->short_block_lost_tot, i); 
+    /*walk the queue to find our "hole"; every element visited on the way
+      (including the one we stop at) gets sub subtracted from its score*/
+    list_for_each(cur,EXTRAQ(d->processor,i)){
+        curinf = list_entry(cur,struct sedf_edom_info,extralist[i]);
+        curinf->score[i] -= sub;
+        if (EDOM_INFO(d)->score[i] < curinf->score[i])
+            break;
+        else
+            PRINT(4,"\tbehind domain %i.%i (score= %i)\n",
+                  curinf->exec_domain->domain->domain_id,
+                  curinf->exec_domain->vcpu_id, curinf->score[i]);
+    }
+    /*cur is where the walk stopped; d is enqueued directly in front of it*/
+    PRINT(3, "\tlist_add to %p\n", cur->prev);
+    list_add(EXTRALIST(d,i),cur->prev);
+ 
+    /*cur was already charged above: charge sub to the elements behind it*/
+    if ((cur != EXTRAQ(d->processor,i)) && sub)
+        for (cur = cur->next; cur != EXTRAQ(d->processor,i);
+             cur = cur-> next) {
+            curinf = list_entry(cur,struct sedf_edom_info,
+                                extralist[i]);
+            curinf->score[i] -= sub;
+            PRINT(4, "\tupdating domain %i.%i (score= %u)\n",
+                  curinf->exec_domain->domain->domain_id, 
+                  curinf->exec_domain->vcpu_id, curinf->score[i]);
+        }
+    ASSERT(extraq_on(d,i));
 }
 static inline void extraq_check(struct exec_domain *d) {
-	if (extraq_on(d, EXTRA_UTIL_Q)) {
-		PRINT(2,"Dom %i.%i is on L1 extraQ\n",d->domain->domain_id, d->vcpu_id);
-		if (!(EDOM_INFO(d)->status & EXTRA_AWARE) &&
-		    !extra_runs(EDOM_INFO(d))) {
-			extraq_del(d, EXTRA_UTIL_Q);
-			PRINT(2,"Removed dom %i.%i from L1 extraQ\n",
-			      d->domain->domain_id, d->vcpu_id);
-		}
-	} else {
-		PRINT(2,"Dom %i.%i is NOT on L1 extraQ\n",d->domain->domain_id,
-		      d->vcpu_id);
-		if ((EDOM_INFO(d)->status & EXTRA_AWARE) && sedf_runnable(d))
-		{
-			#if (EXTRA == EXTRA_ROUNDR)
-			extraq_add_tail(d, EXTRA_UTIL_Q);
-			#elif (EXTRA == EXTRA_SLICE_WEIGHT || \
-			       EXTRA == EXTRA_BLOCK_WEIGHT)
-			extraq_add_sort_update(d, EXTRA_UTIL_Q, 0);
-			#elif
-			;
-			#endif
-			PRINT(2,"Added dom %i.%i to L1 extraQ\n",d->domain->domain_id,
-			      d->vcpu_id);
-		}
-	}
+    if (extraq_on(d, EXTRA_UTIL_Q)) {
+        PRINT(2,"Dom %i.%i is on L1 extraQ\n",d->domain->domain_id, d->vcpu_id);
+        if (!(EDOM_INFO(d)->status & EXTRA_AWARE) &&
+            !extra_runs(EDOM_INFO(d))) {
+            extraq_del(d, EXTRA_UTIL_Q); /* not EXTRA_AWARE and !extra_runs() */
+            PRINT(2,"Removed dom %i.%i from L1 extraQ\n",
+                  d->domain->domain_id, d->vcpu_id);
+        }
+    } else {
+        PRINT(2,"Dom %i.%i is NOT on L1 extraQ\n",d->domain->domain_id,
+              d->vcpu_id);
+        if ((EDOM_INFO(d)->status & EXTRA_AWARE) && sedf_runnable(d))
+        {
+#if (EXTRA == EXTRA_ROUNDR)
+            extraq_add_tail(d, EXTRA_UTIL_Q);
+#elif (EXTRA == EXTRA_SLICE_WEIGHT || \
+          EXTRA == EXTRA_BLOCK_WEIGHT)
+            extraq_add_sort_update(d, EXTRA_UTIL_Q, 0); /* sub == 0 */
+#else /* any other EXTRA mode: nothing is enqueued */
+            ;
+#endif
+            PRINT(2,"Added dom %i.%i to L1 extraQ\n",d->domain->domain_id,
+                  d->vcpu_id);
+        }
+    }
 }
 
 static inline void extraq_check_add_unblocked(struct exec_domain *d, 
-    int priority) {
-	struct sedf_edom_info *inf = EDOM_INFO(d);
-	if (inf->status & EXTRA_AWARE) 
-	#if (EXTRA == EXTRA_ROUNDR)
-		if (priority)
-			extraq_add_head(d,EXTRA_UTIL_Q);
-		else
-			extraq_add_tail(d,EXTRA_UTIL_Q);
-	#elif (EXTRA == EXTRA_SLICE_WEIGHT \
-	    || EXTRA == EXTRA_BLOCK_WEIGHT)
-		/*put in on the weighted extraq, 
-		  without updating any scores*/
-		extraq_add_sort_update(d, EXTRA_UTIL_Q, 0);
-	#else
-		;
-	#endif
+                                              int priority) {
+    struct sedf_edom_info *inf = EDOM_INFO(d);
+    if (inf->status & EXTRA_AWARE) 
+#if (EXTRA == EXTRA_ROUNDR)
+        if (priority)
+            extraq_add_head(d,EXTRA_UTIL_Q);
+        else
+            extraq_add_tail(d,EXTRA_UTIL_Q);
+#elif (EXTRA == EXTRA_SLICE_WEIGHT \
+     || EXTRA == EXTRA_BLOCK_WEIGHT)
+    /*still the body of the unbraced if above: put d on the weighted extraq, 
+    without updating any scores (sub == 0)*/
+    extraq_add_sort_update(d, EXTRA_UTIL_Q, 0);
+#else
+    ;
+#endif
 }
 
 static inline int __task_on_queue(struct exec_domain *d) {
-	return (((LIST(d))->next != NULL) && (LIST(d)->next != LIST(d)));
+    return (((LIST(d))->next != NULL) && (LIST(d)->next != LIST(d)));
 }
 static inline void __del_from_queue(struct exec_domain *d)
 {
@@ -277,41 +275,41 @@ static inline void __del_from_queue(struct exec_domain *d)
 typedef int(*list_comparer)(struct list_head* el1, struct list_head* el2);
 
 static inline void list_insert_sort(struct list_head *list,
-    struct list_head *element, list_comparer comp) {
-	struct list_head     *cur;
-	/*iterate through all elements to find our "hole"*/
-	list_for_each(cur,list){
-		if (comp(element, cur) < 0)
-	 		break;
-	}
-	/*cur now contains the element, before which we'll enqueue*/
-	PRINT(3,"\tlist_add to %p\n",cur->prev);
-	list_add(element, cur->prev);
+                                    struct list_head *element, list_comparer comp) {
+    struct list_head     *cur;
+    /*stop at the first entry that element sorts strictly before (comp < 0)*/
+    list_for_each(cur,list){
+        if (comp(element, cur) < 0)
+            break;
+    }
+    /*cur is where the walk stopped; element is enqueued right before it*/
+    PRINT(3,"\tlist_add to %p\n",cur->prev);
+    list_add(element, cur->prev);
 }  
 #define DOMAIN_COMPARER(name, field, comp1, comp2)          \
 int name##_comp(struct list_head* el1, struct list_head* el2) \
 {                                                           \
-	struct sedf_edom_info *d1, *d2;                     \
-	d1 = list_entry(el1,struct sedf_edom_info, field);  \
-	d2 = list_entry(el2,struct sedf_edom_info, field);  \
-	if ((comp1) == (comp2))                             \
-		return 0;                                   \
-	if ((comp1) < (comp2))                              \
-		return -1;                                  \
-	else                                                \
-		return 1;                                   \
+ struct sedf_edom_info *d1, *d2;                     \
+ d1 = list_entry(el1,struct sedf_edom_info, field);  \
+ d2 = list_entry(el2,struct sedf_edom_info, field);  \
+ if ((comp1) == (comp2))                             \
+  return 0;                                   \
+ if ((comp1) < (comp2))                              \
+  return -1;                                  \
+ else                                                \
+  return 1;                                   \
 }
 /* adds a domain to the queue of processes which wait for the beginning of the
    next period; this list is therefore sortet by this time, which is simply
    absol. deadline - period
  */ 
 DOMAIN_COMPARER(waitq, list, PERIOD_BEGIN(d1), PERIOD_BEGIN(d2))
-static inline void __add_to_waitqueue_sort(struct exec_domain *d) {
-	ASSERT(!__task_on_queue(d));
-	PRINT(3,"Adding domain %i.%i (bop= %"PRIu64") to waitq\n",
-              d->domain->domain_id, d->vcpu_id, PERIOD_BEGIN(EDOM_INFO(d)));
-	list_insert_sort(WAITQ(d->processor), LIST(d), waitq_comp);
-	ASSERT(__task_on_queue(d));
+static inline void __add_to_waitqueue_sort(struct exec_domain *d) {
+    ASSERT(!__task_on_queue(d)); /* LIST(d) must not be linked yet */
+    PRINT(3,"Adding domain %i.%i (bop= %"PRIu64") to waitq\n",
+          d->domain->domain_id, d->vcpu_id, PERIOD_BEGIN(EDOM_INFO(d)));
+    list_insert_sort(WAITQ(d->processor), LIST(d), waitq_comp); /* by period begin */
+    ASSERT(__task_on_queue(d));
 }
 
 /* adds a domain to the queue of processes which have started their current
@@ -320,247 +318,228 @@ static inline void __add_to_waitqueue_sort(struct exec_domain *d) {
    task will run. As we are implementing EDF, this list is sorted by deadlines.
  */ 
 DOMAIN_COMPARER(runq, list, d1->deadl_abs, d2->deadl_abs)
-static inline void __add_to_runqueue_sort(struct exec_domain *d) {
-	PRINT(3,"Adding domain %i.%i (deadl= %"PRIu64") to runq\n",
-              d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->deadl_abs);
-	list_insert_sort(RUNQ(d->processor), LIST(d), runq_comp);
-}
-
-/* Initialises the queues */
-static int sedf_init_scheduler() {
-	int i;
-	PRINT(2,"sedf_init_scheduler was called\n");
-	
-	for ( i = 0; i < NR_CPUS; i++ ) {
-		schedule_data[i].sched_priv = 
-			xmalloc(struct sedf_cpu_info);
-		if ( schedule_data[i].sched_priv == NULL )
-			return -1;
-		INIT_LIST_HEAD(WAITQ(i));
-		INIT_LIST_HEAD(RUNQ(i));
-		INIT_LIST_HEAD(EXTRAQ(i,EXTRA_PEN_Q));
-		INIT_LIST_HEAD(EXTRAQ(i,EXTRA_UTIL_Q));
-	}
-	return 0;   
+static inline void __add_to_runqueue_sort(struct exec_domain *d) {
+    PRINT(3,"Adding domain %i.%i (deadl= %"PRIu64") to runq\n",
+          d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->deadl_abs);
+    list_insert_sort(RUNQ(d->processor), LIST(d), runq_comp); /* by deadl_abs */
 }
 
 /* Allocates memory for per domain private scheduling data*/
 static int sedf_alloc_task(struct exec_domain *d) {
-	PRINT(2,"sedf_alloc_task was called, domain-id %i.%i\n",d->domain->domain_id,
-	      d->vcpu_id);
-	if (d->domain->sched_priv == NULL) {
-		if ((d->domain->sched_priv = 
-		     xmalloc(struct sedf_dom_info)) == NULL )
-		return -1;
-		memset(d->domain->sched_priv, 0, sizeof(struct sedf_dom_info));
-	}
-	if ((d->sched_priv = xmalloc(struct sedf_edom_info)) == NULL )
-		return -1;
-	memset(d->sched_priv, 0, sizeof(struct sedf_edom_info));
-	return 0;
+    PRINT(2,"sedf_alloc_task was called, domain-id %i.%i\n",d->domain->domain_id,
+          d->vcpu_id);
+    if (d->domain->sched_priv == NULL) { /* per-domain data: allocated once */
+        if ((d->domain->sched_priv = 
+             xmalloc(struct sedf_dom_info)) == NULL )
+            return -1;
+        memset(d->domain->sched_priv, 0, sizeof(struct sedf_dom_info));
+    }
+    if ((d->sched_priv = xmalloc(struct sedf_edom_info)) == NULL )
+        return -1; /* NOTE(review): per-domain data from above is kept */
+    memset(d->sched_priv, 0, sizeof(struct sedf_edom_info));
+    return 0; /* 0 on success, -1 if either allocation failed */
 }
 
 /* Setup the sedf_dom_info */
 static void sedf_add_task(struct exec_domain *d)
 {
-	struct sedf_edom_info *inf = EDOM_INFO(d);
-	inf->exec_domain = d;
-	
-	PRINT(2,"sedf_add_task was called, domain-id %i.%i\n",d->domain->domain_id,
-	      d->vcpu_id);
-	      
-	if (d->domain->domain_id==0) {
-		/*set dom0 to something useful to boot the machine*/
-		inf->period    = MILLISECS(20);
-		inf->slice     = MILLISECS(15);
-		inf->latency   = 0;
-		inf->deadl_abs = 0;
-		inf->status     = EXTRA_NONE | SEDF_ASLEEP;/*EXTRA_AWARE; */
-	}
-	else {
-		/*other domains run in best effort mode*/
-		inf->period    = WEIGHT_PERIOD;
-		inf->slice     = 0;
-		inf->deadl_abs = 0;
-		inf->latency   = 0;
-		inf->status     = EXTRA_AWARE | SEDF_ASLEEP;
-		inf->extraweight = 1;
-	}
-	inf->period_orig = inf->period; inf->slice_orig = inf->slice;
-	INIT_LIST_HEAD(&(inf->list));
-	INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q]));
-	INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q]));
-	
-	if (d->domain->domain_id != IDLE_DOMAIN_ID) {
-		extraq_check(d);
-	}
+    struct sedf_edom_info *inf = EDOM_INFO(d);
+    inf->exec_domain = d;
+ 
+    PRINT(2,"sedf_add_task was called, domain-id %i.%i\n",d->domain->domain_id,
+          d->vcpu_id);
+
+    /* Lazily allocate this CPU's scheduler context and queues on first use. */
+    if ( schedule_data[d->processor].sched_priv == NULL )
+    {
+        schedule_data[d->processor].sched_priv = 
+            xmalloc(struct sedf_cpu_info);
+        BUG_ON(schedule_data[d->processor].sched_priv == NULL); /* void fn: no error return */
+        INIT_LIST_HEAD(WAITQ(d->processor));
+        INIT_LIST_HEAD(RUNQ(d->processor));
+        INIT_LIST_HEAD(EXTRAQ(d->processor,EXTRA_PEN_Q));
+        INIT_LIST_HEAD(EXTRAQ(d->processor,EXTRA_UTIL_Q));
+    }
+       
+    if (d->domain->domain_id==0) {
+        /*set dom0 to something useful to boot the machine*/
+        inf->period    = MILLISECS(20);
+        inf->slice     = MILLISECS(15);
+        inf->latency   = 0;
+        inf->deadl_abs = 0;
+        inf->status     = EXTRA_NONE | SEDF_ASLEEP;/*EXTRA_AWARE; */
+    } else {
+        /*other domains run in best effort mode: zero slice, extra time only*/
+        inf->period    = WEIGHT_PERIOD;
+        inf->slice     = 0;
+        inf->deadl_abs = 0;
+        inf->latency   = 0;
+        inf->status     = EXTRA_AWARE | SEDF_ASLEEP;
+        inf->extraweight = 1;
+    }
+    inf->period_orig = inf->period; inf->slice_orig = inf->slice;
+    INIT_LIST_HEAD(&(inf->list));
+    INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q]));
+    INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q]));
+ 
+    if (!is_idle_task(d->domain)) {
+        extraq_check(d);
+    } else { /* idle task: skip the extra queues and clear SEDF_ASLEEP */
+        EDOM_INFO(d)->deadl_abs = 0;
+        EDOM_INFO(d)->status &= ~SEDF_ASLEEP;
+    }
 }
 
 /* Frees memory used by domain info */
 static void sedf_free_task(struct domain *d)
 {
-	int i;
-	PRINT(2,"sedf_free_task was called, domain-id %i\n",d->domain_id);
-	ASSERT(d->sched_priv != NULL);
-	xfree(d->sched_priv);
-	
-	for (i = 0; i < MAX_VIRT_CPUS; i++)
-        	if ( d->exec_domain[i] ) {
-			ASSERT(d->exec_domain[i]->sched_priv != NULL);
-			xfree(d->exec_domain[i]->sched_priv);
-		}
-}
-
-/* Initialises idle task */
-static int sedf_init_idle_task(struct exec_domain *d) {
-	PRINT(2,"sedf_init_idle_task was called, domain-id %i.%i\n",
-	      d->domain->domain_id, d->vcpu_id);
-	if ( sedf_alloc_task(d) < 0 )
-		return -1;
-	
-	sedf_add_task(d);
-	EDOM_INFO(d)->deadl_abs = 0;
-	EDOM_INFO(d)->status &= ~SEDF_ASLEEP;
-	set_bit(_VCPUF_running, &d->vcpu_flags);
-	/*the idle task doesn't have to turn up on any list...*/
-	return 0;
+    int i;
+    PRINT(2,"sedf_free_task was called, domain-id %i\n",d->domain_id);
+    ASSERT(d->sched_priv != NULL);
+    xfree(d->sched_priv); /* per-domain scheduler data */
+ 
+    for (i = 0; i < MAX_VIRT_CPUS; i++) /* then every existing exec_domain's data */
+        if ( d->exec_domain[i] ) {
+            ASSERT(d->exec_domain[i]->sched_priv != NULL);
+            xfree(d->exec_domain[i]->sched_priv);
+        }
 }
 
 /* handles the rescheduling, bookkeeping of domains running in their realtime-time :)*/
 static inline void desched_edf_dom (s_time_t now, struct exec_domain* d) {
-	struct sedf_edom_info* inf = EDOM_INFO(d);
-	/*current domain is running in real time mode*/
-	
-	ASSERT(__task_on_queue(d));
-	/*update the domains cputime*/
-	inf->cputime += now - inf->sched_start_abs;
+    struct sedf_edom_info* inf = EDOM_INFO(d);
+    /*current domain is running in real time mode*/
+ 
+    ASSERT(__task_on_queue(d));
+    /*update the domains cputime*/
+    inf->cputime += now - inf->sched_start_abs;
 
-	/*scheduling decisions, which don't remove the running domain
-	  from the runq*/
-	if ((inf->cputime < inf->slice) && sedf_runnable(d))
-		return;
-		
-	__del_from_queue(d);
-		
-	/*manage bookkeeping (i.e. calculate next deadline,
-	  memorize overun-time of slice) of finished domains*/
-	if (inf->cputime >= inf->slice) {
-		inf->cputime -= inf->slice;
-		
-		if (inf->period < inf->period_orig) {
-			/*this domain runs in latency scaling or burst mode*/
-			#if (UNBLOCK == UNBLOCK_BURST)
-			/*if we are runnig in burst scaling wait for two periods
-			  before scaling periods up again*/ 
-			if (now - inf->unblock_abs >= 2 * inf->period)
-			#endif
-			{
-				inf->period *= 2; inf->slice *= 2;
-				if ((inf->period > inf->period_orig) ||
-				    (inf->slice > inf->slice_orig)) {
-					/*reset slice & period*/
-					inf->period = inf->period_orig;
-					inf->slice = inf->slice_orig;
-				}
-			}
-		}
-		/*set next deadline*/
-		inf->deadl_abs += inf->period;
-	}
-	
-	/*add a runnable domain to the waitqueue*/
-	if (sedf_runnable(d))
-		__add_to_waitqueue_sort(d);
-	else {
-		/*we have a blocked realtime task -> remove it from exqs too*/
-		#if (EXTRA > EXTRA_OFF)
-		#if (EXTRA == EXTRA_BLOCK_WEIGHT)
-		if (extraq_on(d, EXTRA_PEN_Q)) extraq_del(d, EXTRA_PEN_Q);
-		#endif
-		if (extraq_on(d, EXTRA_UTIL_Q)) extraq_del(d, EXTRA_UTIL_Q);
-		#endif
-	}
-	ASSERT(EQ(sedf_runnable(d), __task_on_queue(d)));
-	ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), 
-	  sedf_runnable(d)));
+    /*slice not used up and still runnable: the domain keeps its place
+      on the runq*/
+    if ((inf->cputime < inf->slice) && sedf_runnable(d))
+        return;
+  
+    __del_from_queue(d);
+  
+    /*manage bookkeeping (i.e. calculate next deadline,
+      carry over the overrun-time of the slice) of finished domains*/
+    if (inf->cputime >= inf->slice) {
+        inf->cputime -= inf->slice;
+  
+        if (inf->period < inf->period_orig) {
+            /*this domain runs in latency scaling or burst mode*/
+#if (UNBLOCK == UNBLOCK_BURST)
+            /*if we are running in burst scaling wait for two periods
+              before scaling periods up again*/ 
+            if (now - inf->unblock_abs >= 2 * inf->period)
+#endif
+            {
+                inf->period *= 2; inf->slice *= 2;
+                if ((inf->period > inf->period_orig) ||
+                    (inf->slice > inf->slice_orig)) {
+                    /*doubling overshot: reset slice & period to the originals*/
+                    inf->period = inf->period_orig;
+                    inf->slice = inf->slice_orig;
+                }
+            }
+        }
+        /*set next deadline*/
+        inf->deadl_abs += inf->period;
+    }
+ 
+    /*add a runnable domain to the waitqueue*/
+    if (sedf_runnable(d))
+        __add_to_waitqueue_sort(d);
+    else {
+        /*we have a blocked realtime task -> remove it from exqs too*/
+#if (EXTRA > EXTRA_OFF)
+#if (EXTRA == EXTRA_BLOCK_WEIGHT)
+        if (extraq_on(d, EXTRA_PEN_Q)) extraq_del(d, EXTRA_PEN_Q);
+#endif
+        if (extraq_on(d, EXTRA_UTIL_Q)) extraq_del(d, EXTRA_UTIL_Q);
+#endif
+    }
+    ASSERT(EQ(sedf_runnable(d), __task_on_queue(d)));
+    ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), 
+                 sedf_runnable(d)));
 }
 
 /* Update all elements on the queues */
 static inline void update_queues(s_time_t now, struct list_head* runq, 
-struct list_head* waitq) {
-	struct list_head     *cur,*tmp;
-	struct sedf_edom_info *curinf;
-	
-	PRINT(3,"Updating waitq..\n");
-	/*check for the first elements of the waitqueue, whether their
-	  next period has already started*/
-	list_for_each_safe(cur, tmp, waitq) {
-		curinf = list_entry(cur, struct sedf_edom_info, list);
-		PRINT(4,"\tLooking @ dom %i.%i\n",
-		      curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id);
-		if (PERIOD_BEGIN(curinf) <= now) {
-			__del_from_queue(curinf->exec_domain);
-			__add_to_runqueue_sort(curinf->exec_domain);
-		}
-		else
-			break;
-	}
-	
-	PRINT(3,"Updating runq..\n");
-	/*process the runq, find domains that are on
-	  the runqueue which shouldn't be there*/
-	list_for_each_safe(cur, tmp, runq) {
-		curinf = list_entry(cur,struct sedf_edom_info,list);
-		PRINT(4,"\tLooking @ dom %i.%i\n",
-		      curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id);
-		if (unlikely(curinf->slice == 0)) {
-			/*ignore domains with empty slice*/
-			PRINT(4,"\tUpdating zero-slice domain %i.%i\n",
-			      curinf->exec_domain->domain->domain_id,
-			      curinf->exec_domain->vcpu_id);
-			__del_from_queue(curinf->exec_domain);
-			
-			/*move them to their next period*/
-			curinf->deadl_abs += curinf->period;
-			/*and put them back into the queue*/
-			__add_to_waitqueue_sort(curinf->exec_domain);
-			continue;
-		}
-		if (unlikely((curinf->deadl_abs < now) ||
-			(curinf->cputime > curinf->slice))) {
-			/*we missed the deadline or the slice was
-				already finished... might hapen because
-				of dom_adj.*/
-			PRINT(4,"\tDomain %i.%i exceeded it's deadline/"
-				"slice (%"PRIu64" / %"PRIu64") now: %"PRIu64
-				" cputime: %"PRIu64"\n",
-				curinf->exec_domain->domain->domain_id,
-				curinf->exec_domain->vcpu_id,
-				curinf->deadl_abs, curinf->slice, now,
-				curinf->cputime);
-			__del_from_queue(curinf->exec_domain);
-			/*common case: we miss one period!*/
-			curinf->deadl_abs += curinf->period;
-			
-			/*if we are still behind: modulo arithmetic,
-				force deadline to be in future and
-				aligned to period borders!*/
-			if (unlikely(curinf->deadl_abs < now))
-				curinf->deadl_abs += 
-					DIV_UP(now - curinf->deadl_abs,
-					curinf->period) * curinf->period;
-			ASSERT(curinf->deadl_abs > now);
-			/*give a fresh slice*/
-			curinf->cputime = 0;
-			if (PERIOD_BEGIN(curinf) > now)
-				__add_to_waitqueue_sort(curinf->exec_domain);
-			else
-				__add_to_runqueue_sort(curinf->exec_domain);
-		}
-		else
-			break;
-	}
-	PRINT(3,"done updating the queues\n");
+                                 struct list_head* waitq) {
+    struct list_head     *cur,*tmp;
+    struct sedf_edom_info *curinf;
+ 
+    PRINT(3,"Updating waitq..\n");
+    /*check for the first elements of the waitqueue, whether their
+      next period has already started; if so move them to the runq*/
+    list_for_each_safe(cur, tmp, waitq) {
+        curinf = list_entry(cur, struct sedf_edom_info, list);
+        PRINT(4,"\tLooking @ dom %i.%i\n",
+              curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id);
+        if (PERIOD_BEGIN(curinf) <= now) {
+            __del_from_queue(curinf->exec_domain);
+            __add_to_runqueue_sort(curinf->exec_domain);
+        }
+        else
+            break;
+    }
+ 
+    PRINT(3,"Updating runq..\n");
+    /*process the runq, find domains that are on
+      the runqueue which shouldn't be there*/
+    list_for_each_safe(cur, tmp, runq) {
+        curinf = list_entry(cur,struct sedf_edom_info,list);
+        PRINT(4,"\tLooking @ dom %i.%i\n",
+              curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id);
+        if (unlikely(curinf->slice == 0)) {
+            /*domains with an empty slice never stay on the runq*/
+            PRINT(4,"\tUpdating zero-slice domain %i.%i\n",
+                  curinf->exec_domain->domain->domain_id,
+                  curinf->exec_domain->vcpu_id);
+            __del_from_queue(curinf->exec_domain);
+   
+            /*move them to their next period*/
+            curinf->deadl_abs += curinf->period;
+            /*and put them back into the waitqueue*/
+            __add_to_waitqueue_sort(curinf->exec_domain);
+            continue;
+        }
+        if (unlikely((curinf->deadl_abs < now) ||
+                     (curinf->cputime > curinf->slice))) {
+            /*we missed the deadline or the slice was
+              already finished... might happen because
+              of dom_adj.*/
+            PRINT(4,"\tDomain %i.%i exceeded it's deadline/"
+                  "slice (%"PRIu64" / %"PRIu64") now: %"PRIu64
+                  " cputime: %"PRIu64"\n",
+                  curinf->exec_domain->domain->domain_id,
+                  curinf->exec_domain->vcpu_id,
+                  curinf->deadl_abs, curinf->slice, now,
+                  curinf->cputime);
+            __del_from_queue(curinf->exec_domain);
+            /*common case: we miss one period!*/
+            curinf->deadl_abs += curinf->period;
+   
+            /*if we are still behind: modulo arithmetic,
+              force deadline to be in future and
+              aligned to period borders!*/
+            if (unlikely(curinf->deadl_abs < now))
+                curinf->deadl_abs += 
+                    DIV_UP(now - curinf->deadl_abs,
+                           curinf->period) * curinf->period;
+            ASSERT(curinf->deadl_abs > now);
+            /*give a fresh slice*/
+            curinf->cputime = 0;
+            if (PERIOD_BEGIN(curinf) > now)
+                __add_to_waitqueue_sort(curinf->exec_domain);
+            else
+                __add_to_runqueue_sort(curinf->exec_domain);
+        }
+        else
+            break;
+    }
+    PRINT(3,"done updating the queues\n");
 }
 
 #if (EXTRA > EXTRA_OFF)
@@ -571,140 +550,140 @@ struct list_head* waitq) {
    if the domain is blocked / has regained its short-block-loss
    time it is not put on any queue */
 static inline void desched_extra_dom(s_time_t now, struct exec_domain* d) {
-	struct sedf_edom_info	*inf = EDOM_INFO(d);
-	int 			i    = extra_get_cur_q(inf);
-	
+    struct sedf_edom_info *inf = EDOM_INFO(d);
+    int    i    = extra_get_cur_q(inf);
+ 
 #if (EXTRA == EXTRA_SLICE_WEIGHT || EXTRA == EXTRA_BLOCK_WEIGHT)
-	unsigned long         oldscore;
+    unsigned long         oldscore;
 #endif
-	ASSERT(extraq_on(d, i));
-	/*unset all running flags*/
-	inf->status  &= ~(EXTRA_RUN_PEN | EXTRA_RUN_UTIL);
-	/*fresh slice for the next run*/
-	inf->cputime = 0;
-	/*accumulate total extratime*/
-	inf->extra_time_tot += now - inf->sched_start_abs;
-	/*remove extradomain from head of the queue*/
-	extraq_del(d, i);
+    ASSERT(extraq_on(d, i));
+    /*unset all running flags*/
+    inf->status  &= ~(EXTRA_RUN_PEN | EXTRA_RUN_UTIL);
+    /*fresh slice for the next run*/
+    inf->cputime = 0;
+    /*accumulate total extratime*/
+    inf->extra_time_tot += now - inf->sched_start_abs;
+    /*remove extradomain from head of the queue*/
+    extraq_del(d, i);
 
 #if (EXTRA == EXTRA_ROUNDR)
-	if (sedf_runnable(d) && (inf->status & EXTRA_AWARE))
-		/*add to the tail if it is runnable => round-robin*/
-		extraq_add_tail(d, EXTRA_UTIL_Q);
+    if (sedf_runnable(d) && (inf->status & EXTRA_AWARE))
+        /*add to the tail if it is runnable => round-robin*/
+        extraq_add_tail(d, EXTRA_UTIL_Q);
 #elif (EXTRA == EXTRA_SLICE_WEIGHT || EXTRA == EXTRA_BLOCK_WEIGHT)
-	/*update the score*/
-	oldscore      = inf->score[i];
+    /*update the score*/
+    oldscore      = inf->score[i];
 #if (EXTRA == EXTRA_BLOCK_WEIGHT)
-	if (i == EXTRA_PEN_Q) {
-		/*domain was running in L0 extraq*/
-		/*reduce block lost, probably more sophistication here!*/
-		/*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/
-		inf->short_block_lost_tot -= now - inf->sched_start_abs;
-		PRINT(3,"Domain %i.%i: Short_block_loss: %"PRIi64"\n", 
-		      inf->exec_domain->domain->domain_id, inf->exec_domain->vcpu_id,
-		      inf->short_block_lost_tot);
-		if (inf->short_block_lost_tot <= 0) {
-			PRINT(4,"Domain %i.%i compensated short block loss!\n",
-			  inf->exec_domain->domain->domain_id, inf->exec_domain->vcpu_id);
-			/*we have (over-)compensated our block penalty*/
-			inf->short_block_lost_tot = 0;
-			/*we don't want a place on the penalty queue anymore!*/
-			inf->status &= ~EXTRA_WANT_PEN_Q;
-			goto check_extra_queues;
-		}
-		/*we have to go again for another try in the block-extraq,
-		  the score is not used incremantally here, as this is
-		  already done by recalculating the block_lost*/
-		inf->score[EXTRA_PEN_Q] = (inf->period << 10) /
-		                          inf->short_block_lost_tot;
-		oldscore = 0;
-	} else
+    if (i == EXTRA_PEN_Q) {
+        /*domain was running in L0 extraq*/
+        /*reduce block lost, probably more sophistication here!*/
+        /*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/
+        inf->short_block_lost_tot -= now - inf->sched_start_abs;
+        PRINT(3,"Domain %i.%i: Short_block_loss: %"PRIi64"\n", 
+              inf->exec_domain->domain->domain_id, inf->exec_domain->vcpu_id,
+              inf->short_block_lost_tot);
+        if (inf->short_block_lost_tot <= 0) {
+            PRINT(4,"Domain %i.%i compensated short block loss!\n",
+                  inf->exec_domain->domain->domain_id, inf->exec_domain->vcpu_id);
+            /*we have (over-)compensated our block penalty*/
+            inf->short_block_lost_tot = 0;
+            /*we don't want a place on the penalty queue anymore!*/
+            inf->status &= ~EXTRA_WANT_PEN_Q;
+            goto check_extra_queues;
+        }
+        /*we have to go again for another try in the block-extraq,
+          the score is not used incrementally here, as this is
+          already done by recalculating the block_lost*/
+        inf->score[EXTRA_PEN_Q] = (inf->period << 10) /
+            inf->short_block_lost_tot;
+        oldscore = 0; /* so the re-insert below subtracts nothing from others */
+    } else
 #endif
-	{
-		/*domain was running in L1 extraq => score is inverse of
-		  utilization and is used somewhat incremental!*/
-		if (!inf->extraweight)
-			/*NB: use fixed point arithmetic with 10 bits*/
-			inf->score[EXTRA_UTIL_Q] = (inf->period << 10) /
-			                            inf->slice;
-		else
-			/*give a domain w/ exweight = 1 as much as a domain with
-			  util = 1/128*/
-			inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight;
-	}
-check_extra_queues:
-	/* Adding a runnable domain to the right queue and removing blocked ones*/
-	if (sedf_runnable(d)) {
-		/*add according to score: weighted round robin*/
-		if (inf->status & (EXTRA_AWARE | EXTRA_WANT_PEN_Q))
-			extraq_add_sort_update(d, i, oldscore);
-	}
-	else {
-		/*remove this blocked domain from the waitq!*/
-		__del_from_queue(d);
+    {
+        /*domain was running in L1 extraq => score is inverse of
+          utilization and is used somewhat incremental!*/
+        if (!inf->extraweight)
+            /*NB: use fixed point arithmetic with 10 bits*/
+            inf->score[EXTRA_UTIL_Q] = (inf->period << 10) /
+                inf->slice;
+        else
+            /*give a domain w/ exweight = 1 as much as a domain with
+              util = 1/128*/
+            inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight;
+    }
+ check_extra_queues:
+    /* Adding a runnable domain to the right queue and removing blocked ones*/
+    if (sedf_runnable(d)) {
+        /*add according to score: weighted round robin*/
+        if (inf->status & (EXTRA_AWARE | EXTRA_WANT_PEN_Q))
+            extraq_add_sort_update(d, i, oldscore);
+    }
+    else {
+        /*remove this blocked domain from the waitq!*/
+        __del_from_queue(d);
 #if (EXTRA == EXTRA_BLOCK_WEIGHT)
-		/*make sure that we remove a blocked domain from the other
-		  extraq too*/
-		if (i == EXTRA_PEN_Q) {
-			if (extraq_on(d, EXTRA_UTIL_Q))
-				extraq_del(d, EXTRA_UTIL_Q);
-		}
-		else {
-			if (extraq_on(d, EXTRA_PEN_Q))
-				extraq_del(d, EXTRA_PEN_Q);
-		}
+        /*make sure that we remove a blocked domain from the other
+          extraq too*/
+        if (i == EXTRA_PEN_Q) {
+            if (extraq_on(d, EXTRA_UTIL_Q))
+                extraq_del(d, EXTRA_UTIL_Q);
+        }
+        else {
+            if (extraq_on(d, EXTRA_PEN_Q))
+                extraq_del(d, EXTRA_PEN_Q);
+        }
 #endif
-	}
+    }
 #endif
-	ASSERT(EQ(sedf_runnable(d), __task_on_queue(d)));
-	ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), 
-	  sedf_runnable(d)));
+    ASSERT(EQ(sedf_runnable(d), __task_on_queue(d)));
+    ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), 
+                 sedf_runnable(d)));
 }
 #endif
 
 static inline struct task_slice sedf_do_extra_schedule (s_time_t now,
-    s_time_t end_xt, struct list_head *extraq[], int cpu) {
-	struct task_slice 		ret;
-	struct sedf_edom_info	*runinf;
-	
-	/* Enough time left to use for extratime? */
-	if (end_xt - now < EXTRA_QUANTUM)
-		goto return_idle;
+                                                        s_time_t end_xt, struct list_head *extraq[], int cpu) {
+    struct task_slice   ret;
+    struct sedf_edom_info *runinf;
+ 
+    /* Enough time left to use for extratime? */
+    if (end_xt - now < EXTRA_QUANTUM)
+        goto return_idle;
 #if (EXTRA == EXTRA_BLOCK_WEIGHT)
-	if (!list_empty(extraq[EXTRA_PEN_Q])) {
-		/*we still have elements on the level 0 extraq 
-		  => let those run first!*/
-		runinf   = list_entry(extraq[EXTRA_PEN_Q]->next, 
-		              struct sedf_edom_info, extralist[EXTRA_PEN_Q]);
-		runinf->status |= EXTRA_RUN_PEN;
-		ret.task = runinf->exec_domain;
-		ret.time = EXTRA_QUANTUM;
+    if (!list_empty(extraq[EXTRA_PEN_Q])) {
+        /*we still have elements on the level 0 extraq 
+          => let those run first!*/
+        runinf   = list_entry(extraq[EXTRA_PEN_Q]->next, 
+                              struct sedf_edom_info, extralist[EXTRA_PEN_Q]);
+        runinf->status |= EXTRA_RUN_PEN;
+        ret.task = runinf->exec_domain;
+        ret.time = EXTRA_QUANTUM;
 #ifdef SEDF_STATS
-		runinf->pen_extra_slices++;
+        runinf->pen_extra_slices++;
 #endif
-	} else
+    } else
 #endif
-	if (!list_empty(extraq[EXTRA_UTIL_Q])) {
-		/*use elements from the normal extraqueue*/
-		runinf   = list_entry(extraq[EXTRA_UTIL_Q]->next,
-		              struct sedf_edom_info, extralist[EXTRA_UTIL_Q]);
-		runinf->status |= EXTRA_RUN_UTIL;
-		ret.task = runinf->exec_domain;
-		ret.time = EXTRA_QUANTUM;
-	}
-	else
-		goto return_idle;
+        if (!list_empty(extraq[EXTRA_UTIL_Q])) {
+            /*use elements from the normal extraqueue*/
+            runinf   = list_entry(extraq[EXTRA_UTIL_Q]->next,
+                                  struct sedf_edom_info, extralist[EXTRA_UTIL_Q]);
+            runinf->status |= EXTRA_RUN_UTIL;
+            ret.task = runinf->exec_domain;
+            ret.time = EXTRA_QUANTUM;
+        }
+        else
+            goto return_idle;
 
-	ASSERT(ret.time > 0);
-	ASSERT(sedf_runnable(ret.task));
-	return ret;
-	
-return_idle:
-	ret.task = IDLETASK(cpu);
-	ret.time = end_xt - now;
-	ASSERT(ret.time > 0);
-	ASSERT(sedf_runnable(ret.task));
-	return ret;
+    ASSERT(ret.time > 0);
+    ASSERT(sedf_runnable(ret.task));
+    return ret;
+ 
+ return_idle:
+    ret.task = IDLETASK(cpu);
+    ret.time = end_xt - now;
+    ASSERT(ret.time > 0);
+    ASSERT(sedf_runnable(ret.task));
+    return ret;
 }
 /* Main scheduling function
    Reasons for calling this function are:
@@ -713,126 +692,123 @@ return_idle:
    -and various others ;) in general: determine which domain to run next*/
 static struct task_slice sedf_do_schedule(s_time_t now)
 {
-	int                   cpu      = current->processor;
-	struct list_head     *runq     = RUNQ(cpu);
-	struct list_head     *waitq    = WAITQ(cpu);
-	#if (EXTRA > EXTRA_OFF)
-	struct sedf_edom_info *inf     = EDOM_INFO(current);
-	struct list_head     *extraq[] = {EXTRAQ(cpu, EXTRA_PEN_Q),
-	                                  EXTRAQ(cpu, EXTRA_UTIL_Q)};
-	#endif
-	struct task_slice          ret;
-	/*int i = 0;*/
-	/*idle tasks don't need any of the following stuf*/
-	if (is_idle_task(current->domain))
-		goto check_waitq;
-	
-	/* create local state of the status of the domain, in order to avoid
-	   inconsistent state during scheduling decisions, because data for
-	   domain_runnable is not protected by the scheduling lock!*/
-	if(!domain_runnable(current))
-		inf->status |= SEDF_ASLEEP;
-	
-	if (inf->status & SEDF_ASLEEP)
-		inf->block_abs = now;
+    int                   cpu      = current->processor;
+    struct list_head     *runq     = RUNQ(cpu);
+    struct list_head     *waitq    = WAITQ(cpu);
+#if (EXTRA > EXTRA_OFF)
+    struct sedf_edom_info *inf     = EDOM_INFO(current);
+    struct list_head     *extraq[] = {EXTRAQ(cpu, EXTRA_PEN_Q),
+                                      EXTRAQ(cpu, EXTRA_UTIL_Q)};
+#endif
+    struct task_slice          ret;
+    /*int i = 0;*/
+    /*idle tasks don't need any of the following stuff*/
+    if (is_idle_task(current->domain))
+        goto check_waitq;
+ 
+    /* create local state of the status of the domain, in order to avoid
+       inconsistent state during scheduling decisions, because data for
+       domain_runnable is not protected by the scheduling lock!*/
+    if(!domain_runnable(current))
+        inf->status |= SEDF_ASLEEP;
+ 
+    if (inf->status & SEDF_ASLEEP)
+        inf->block_abs = now;
 
-	#if (EXTRA > EXTRA_OFF)
-	if (unlikely(extra_runs(inf))) {
-		/*special treatment of domains running in extra time*/
-		desched_extra_dom(now, current);
-	}
-	else 
-	#endif
-	{
-		desched_edf_dom(now, current);
-	}
-check_waitq:
-	update_queues(now, runq, waitq);
-	
-	/*now simply pick the first domain from the runqueue, which has the
-	  earliest deadline, because the list is sorted*/
-	struct sedf_edom_info *runinf, *waitinf;
-	
-	if (!list_empty(runq)) {
-		runinf   = list_entry(runq->next,struct sedf_edom_info,list);
-		ret.task = runinf->exec_domain;
-		if (!list_empty(waitq)) {
-			waitinf  = list_entry(waitq->next,
-			               struct sedf_edom_info,list);
-			/*rerun scheduler, when scheduled domain reaches it's
-			  end of slice or the first domain from the waitqueue
-			  gets ready*/
-			ret.time = MIN(now + runinf->slice - runinf->cputime,
-			               PERIOD_BEGIN(waitinf)) - now;
-		}
-		else {
-			ret.time = runinf->slice - runinf->cputime;
-		}
-		CHECK(ret.time > 0);
-		goto sched_done;
-	}
-	
-	if (!list_empty(waitq)) {
-		waitinf  = list_entry(waitq->next,struct sedf_edom_info, list);
-		/*we could not find any suitable domain 
-		  => look for domains that are aware of extratime*/
-		#if (EXTRA > EXTRA_OFF)
-		ret = sedf_do_extra_schedule(now, PERIOD_BEGIN(waitinf),
-		                             extraq, cpu);
-		#else
-		ret.task = IDLETASK(cpu);
-		ret.time = PERIOD_BEGIN(waitinf) - now;
-		#endif
-		CHECK(ret.time > 0);
-	}
-	else {
-		/*this could probably never happen, but one never knows...*/
-		/*it can... imagine a second CPU, which is pure scifi ATM,
-		  but one never knows ;)*/
-		ret.task = IDLETASK(cpu);
-		ret.time = SECONDS(1);
-	}
+#if (EXTRA > EXTRA_OFF)
+    if (unlikely(extra_runs(inf))) {
+        /*special treatment of domains running in extra time*/
+        desched_extra_dom(now, current);
+    }
+    else 
+#endif
+    {
+        desched_edf_dom(now, current);
+    }
+ check_waitq:
+    update_queues(now, runq, waitq);
+ 
+    /*now simply pick the first domain from the runqueue, which has the
+      earliest deadline, because the list is sorted*/
+    struct sedf_edom_info *runinf, *waitinf;
+ 
+    if (!list_empty(runq)) {
+        runinf   = list_entry(runq->next,struct sedf_edom_info,list);
+        ret.task = runinf->exec_domain;
+        if (!list_empty(waitq)) {
+            waitinf  = list_entry(waitq->next,
+                                  struct sedf_edom_info,list);
+            /*rerun scheduler, when scheduled domain reaches its
+              end of slice or the first domain from the waitqueue
+              gets ready*/
+            ret.time = MIN(now + runinf->slice - runinf->cputime,
+                           PERIOD_BEGIN(waitinf)) - now;
+        }
+        else {
+            ret.time = runinf->slice - runinf->cputime;
+        }
+        CHECK(ret.time > 0);
+        goto sched_done;
+    }
+ 
+    if (!list_empty(waitq)) {
+        waitinf  = list_entry(waitq->next,struct sedf_edom_info, list);
+        /*we could not find any suitable domain 
+          => look for domains that are aware of extratime*/
+#if (EXTRA > EXTRA_OFF)
+        ret = sedf_do_extra_schedule(now, PERIOD_BEGIN(waitinf),
+                                     extraq, cpu);
+#else
+        ret.task = IDLETASK(cpu);
+        ret.time = PERIOD_BEGIN(waitinf) - now;
+#endif
+        CHECK(ret.time > 0);
+    }
+    else {
+        /*this could probably never happen, but one never knows...*/
+        /*it can... imagine a second CPU, which is pure scifi ATM,
+          but one never knows ;)*/
+        ret.task = IDLETASK(cpu);
+        ret.time = SECONDS(1);
+    }
 
-sched_done:	
-	/*TODO: Do something USEFUL when this happens and find out, why it
-	still can happen!!!*/
-	if (ret.time<0) {
-		printk("Ouch! We are seriously BEHIND schedule! %"PRIi64"\n",
-		       ret.time);
-		ret.time = EXTRA_QUANTUM;
-	}
-	EDOM_INFO(ret.task)->sched_start_abs = now;
-	CHECK(ret.time > 0);
-	ASSERT(sedf_runnable(ret.task));
-	return ret;
+ sched_done: 
+    /*TODO: Do something USEFUL when this happens and find out, why it
+      still can happen!!!*/
+    if (ret.time<0) {
+        printk("Ouch! We are seriously BEHIND schedule! %"PRIi64"\n",
+               ret.time);
+        ret.time = EXTRA_QUANTUM;
+    }
+    EDOM_INFO(ret.task)->sched_start_abs = now;
+    CHECK(ret.time > 0);
+    ASSERT(sedf_runnable(ret.task));
+    return ret;
 }
 
 static void sedf_sleep(struct exec_domain *d) {
-	PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id);
-	
-	if (is_idle_task(d->domain))
-		return;
+    PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id);
+ 
+    if (is_idle_task(d->domain))
+        return;
 
-	EDOM_INFO(d)->status |= SEDF_ASLEEP;
-	
-	if ( test_bit(_VCPUF_running, &d->vcpu_flags) ) {
-#ifdef ADV_SCHED_HISTO
-		adv_sched_hist_start(d->processor);
+    EDOM_INFO(d)->status |= SEDF_ASLEEP;
+ 
+    if ( test_bit(_VCPUF_running, &d->vcpu_flags) ) {
+        cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
+    }
+    else  {
+        if ( __task_on_queue(d) )
+            __del_from_queue(d);
+#if (EXTRA > EXTRA_OFF)
+        if (extraq_on(d, EXTRA_UTIL_Q)) 
+            extraq_del(d, EXTRA_UTIL_Q);
+#endif
+#if (EXTRA == EXTRA_BLOCK_WEIGHT)
+        if (extraq_on(d, EXTRA_PEN_Q))
+            extraq_del(d, EXTRA_PEN_Q);
 #endif
-		cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
-	}
-	else  {
-		if ( __task_on_queue(d) )
-			__del_from_queue(d);
-		#if (EXTRA > EXTRA_OFF)
-		if (extraq_on(d, EXTRA_UTIL_Q)) 
-			extraq_del(d, EXTRA_UTIL_Q);
-		#endif
-		#if (EXTRA == EXTRA_BLOCK_WEIGHT)
-		if (extraq_on(d, EXTRA_PEN_Q))
-			extraq_del(d, EXTRA_PEN_Q);
-		#endif
-	}
+    }
 }
 
 /* This function wakes up a domain, i.e. moves them into the waitqueue
@@ -908,555 +884,554 @@ static void sedf_sleep(struct exec_domain *d) {
  */
 static inline void unblock_short_vcons
 (struct sedf_edom_info* inf, s_time_t now) {
-	inf->deadl_abs += inf->period;
-	inf->cputime = 0;
+    inf->deadl_abs += inf->period;
+    inf->cputime = 0;
 }
 
 static inline void unblock_short_cons(struct sedf_edom_info* inf, s_time_t now)
 {
-	/*treat blocked time as consumed by the domain*/
-	inf->cputime += now - inf->block_abs;	
-	if (inf->cputime + EXTRA_QUANTUM > inf->slice) {
-		/*we don't have a reasonable amount of time in 
-		  our slice left :( => start in next period!*/
-		unblock_short_vcons(inf, now);
-	}
+    /*treat blocked time as consumed by the domain*/
+    inf->cputime += now - inf->block_abs; 
+    if (inf->cputime + EXTRA_QUANTUM > inf->slice) {
+        /*we don't have a reasonable amount of time in 
+          our slice left :( => start in next period!*/
+        unblock_short_vcons(inf, now);
+    }
 #ifdef SEDF_STATS
-	else
-		inf->short_cont++;
+    else
+        inf->short_cont++;
 #endif
 }
 static inline void unblock_short_extra_support (struct sedf_edom_info* inf,
-   s_time_t now) {
-	/*this unblocking scheme tries to support the domain, by assigning it
-	   a priority in extratime distribution according to the loss of time
-	   in this slice due to blocking*/
-	s_time_t pen;
-	
-	/*no more realtime execution in this period!*/
-	inf->deadl_abs += inf->period;
-	if (likely(inf->block_abs)) {
-		//treat blocked time as consumed by the domain*/
-		/*inf->cputime += now - inf->block_abs;*/
-		/*penalty is time the domain would have
-		  had if it continued to run */
-		pen = (inf->slice - inf->cputime);
-		if (pen < 0) pen = 0;
-		/*accumulate all penalties over the periods*/
-		/*inf->short_block_lost_tot += pen;*/
-		/*set penalty to the current value*/
-		inf->short_block_lost_tot = pen;
-		/*not sure which one is better.. but seems to work well...*/
-		
-		if (inf->short_block_lost_tot) {
-			inf->score[0] = (inf->period << 10) /
-			                 inf->short_block_lost_tot;
+                                                s_time_t now) {
+    /*this unblocking scheme tries to support the domain, by assigning it
+    a priority in extratime distribution according to the loss of time
+    in this slice due to blocking*/
+    s_time_t pen;
+ 
+    /*no more realtime execution in this period!*/
+    inf->deadl_abs += inf->period;
+    if (likely(inf->block_abs)) {
+        /*treat blocked time as consumed by the domain*/
+        /*inf->cputime += now - inf->block_abs;*/
+        /*penalty is time the domain would have
+          had if it continued to run */
+        pen = (inf->slice - inf->cputime);
+        if (pen < 0) pen = 0;
+        /*accumulate all penalties over the periods*/
+        /*inf->short_block_lost_tot += pen;*/
+        /*set penalty to the current value*/
+        inf->short_block_lost_tot = pen;
+        /*not sure which one is better.. but seems to work well...*/
+  
+        if (inf->short_block_lost_tot) {
+            inf->score[0] = (inf->period << 10) /
+                inf->short_block_lost_tot;
 #ifdef SEDF_STATS
-			inf->pen_extra_blocks++;
+            inf->pen_extra_blocks++;
 #endif
-			if (extraq_on(inf->exec_domain, EXTRA_PEN_Q))
-				/*remove domain for possible resorting!*/
-				extraq_del(inf->exec_domain, EXTRA_PEN_Q);
-			else
-				/*remember that we want to be on the penalty q
-				  so that we can continue when we (un-)block
-				  in penalty-extratime*/
-				inf->status |= EXTRA_WANT_PEN_Q;
-			
-			/*(re-)add domain to the penalty extraq*/
-			extraq_add_sort_update(inf->exec_domain,
-					 EXTRA_PEN_Q, 0);
-		}
-	}
-	/*give it a fresh slice in the next period!*/
-	inf->cputime = 0;
+            if (extraq_on(inf->exec_domain, EXTRA_PEN_Q))
+                /*remove domain for possible resorting!*/
+                extraq_del(inf->exec_domain, EXTRA_PEN_Q);
+            else
+                /*remember that we want to be on the penalty q
+                  so that we can continue when we (un-)block
+                  in penalty-extratime*/
+                inf->status |= EXTRA_WANT_PEN_Q;
+   
+            /*(re-)add domain to the penalty extraq*/
+            extraq_add_sort_update(inf->exec_domain,
+                                   EXTRA_PEN_Q, 0);
+        }
+    }
+    /*give it a fresh slice in the next period!*/
+    inf->cputime = 0;
 }
 static inline void unblock_long_vcons(struct sedf_edom_info* inf, s_time_t now)
 {
-	/* align to next future period */
-	inf->deadl_abs += (DIV_UP(now - inf->deadl_abs, inf->period) +1)
-	                 * inf->period;
-	inf->cputime = 0;
+    /* align to next future period */
+    inf->deadl_abs += (DIV_UP(now - inf->deadl_abs, inf->period) +1)
+        * inf->period;
+    inf->cputime = 0;
 }
 
 static inline void unblock_long_cons_a (struct sedf_edom_info* inf,
-   s_time_t now) {
-	/*treat the time the domain was blocked in the
-	  CURRENT period as consumed by the domain*/
-	inf->cputime = (now - inf->deadl_abs) % inf->period;	
-	if (inf->cputime + EXTRA_QUANTUM > inf->slice) {
-		/*we don't have a reasonable amount of time in our slice
-		  left :( => start in next period!*/
-		unblock_long_vcons(inf, now);
-	}
+                                        s_time_t now) {
+    /*treat the time the domain was blocked in the
+   CURRENT period as consumed by the domain*/
+    inf->cputime = (now - inf->deadl_abs) % inf->period; 
+    if (inf->cputime + EXTRA_QUANTUM > inf->slice) {
+        /*we don't have a reasonable amount of time in our slice
+          left :( => start in next period!*/
+        unblock_long_vcons(inf, now);
+    }
 }
 static inline void unblock_long_cons_b(struct sedf_edom_info* inf,s_time_t now) {
-	/*Conservative 2b*/
-	/*Treat the unblocking time as a start of a new period */
-	inf->deadl_abs = now + inf->period;
-	inf->cputime = 0;
+    /*Conservative 2b*/
+    /*Treat the unblocking time as a start of a new period */
+    inf->deadl_abs = now + inf->period;
+    inf->cputime = 0;
 }
 static inline void unblock_long_cons_c(struct sedf_edom_info* inf,s_time_t now) {
-	if (likely(inf->latency)) {
-		/*scale the slice and period accordingly to the latency hint*/
-		/*reduce period temporarily to the latency hint*/
-		inf->period = inf->latency;
-		/*this results in max. 4s slice/period length*/
-		ASSERT((inf->period < ULONG_MAX)
-		    && (inf->slice_orig < ULONG_MAX));
-		/*scale slice accordingly, so that utilisation stays the same*/
-		inf->slice = (inf->period * inf->slice_orig)
-		            / inf->period_orig;
-		inf->deadl_abs = now + inf->period;
-		inf->cputime = 0;
-	}	
-	else {
-		/*we don't have a latency hint.. use some other technique*/
-		unblock_long_cons_b(inf, now);
-	}
+    if (likely(inf->latency)) {
+        /*scale the slice and period accordingly to the latency hint*/
+        /*reduce period temporarily to the latency hint*/
+        inf->period = inf->latency;
+        /*this results in max. 4s slice/period length*/
+        ASSERT((inf->period < ULONG_MAX)
+               && (inf->slice_orig < ULONG_MAX));
+        /*scale slice accordingly, so that utilisation stays the same*/
+        inf->slice = (inf->period * inf->slice_orig)
+            / inf->period_orig;
+        inf->deadl_abs = now + inf->period;
+        inf->cputime = 0;
+    } 
+    else {
+        /*we don't have a latency hint.. use some other technique*/
+        unblock_long_cons_b(inf, now);
+    }
 }
 /*a new idea of dealing with short blocks: burst period scaling*/
 static inline void unblock_short_burst(struct sedf_edom_info* inf, s_time_t now)
 {
-	/*treat blocked time as consumed by the domain*/
-	inf->cputime += now - inf->block_abs;
-	
-	if (inf->cputime + EXTRA_QUANTUM <= inf->slice) {
-		/*if we can still use some time in the current slice
-		  then use it!*/
+    /*treat blocked time as consumed by the domain*/
+    inf->cputime += now - inf->block_abs;
+ 
+    if (inf->cputime + EXTRA_QUANTUM <= inf->slice) {
+        /*if we can still use some time in the current slice
+          then use it!*/
 #ifdef SEDF_STATS
-		/*we let the domain run in the current period*/
-		inf->short_cont++;
+        /*we let the domain run in the current period*/
+        inf->short_cont++;
 #endif
-	}
-	else {
-		/*we don't have a reasonable amount of time in
-		  our slice left => switch to burst mode*/
-		if (likely(inf->unblock_abs)) {
-			/*set the period-length to the current blocking
-			  interval, possible enhancements: average over last
-			  blocking intervals, user-specified minimum,...*/
-			inf->period = now - inf->unblock_abs;
-			/*check for overflow on multiplication*/
-			ASSERT((inf->period < ULONG_MAX) 
-			    && (inf->slice_orig < ULONG_MAX));
-			/*scale slice accordingly, so that utilisation
-			  stays the same*/
-			inf->slice = (inf->period * inf->slice_orig)
-			            / inf->period_orig;
-			/*set new (shorter) deadline*/
-			inf->deadl_abs += inf->period;
-		}
-		else {
-			/*in case we haven't unblocked before
-			  start in next period!*/
-			inf->cputime=0;
-			inf->deadl_abs += inf->period;
-		}
-	}
-	inf->unblock_abs = now;
+    }
+    else {
+        /*we don't have a reasonable amount of time in
+          our slice left => switch to burst mode*/
+        if (likely(inf->unblock_abs)) {
+            /*set the period-length to the current blocking
+              interval, possible enhancements: average over last
+              blocking intervals, user-specified minimum,...*/
+            inf->period = now - inf->unblock_abs;
+            /*check for overflow on multiplication*/
+            ASSERT((inf->period < ULONG_MAX) 
+                   && (inf->slice_orig < ULONG_MAX));
+            /*scale slice accordingly, so that utilisation
+              stays the same*/
+            inf->slice = (inf->period * inf->slice_orig)
+                / inf->period_orig;
+            /*set new (shorter) deadline*/
+            inf->deadl_abs += inf->period;
+        }
+        else {
+            /*in case we haven't unblocked before
+              start in next period!*/
+            inf->cputime=0;
+            inf->deadl_abs += inf->period;
+        }
+    }
+    inf->unblock_abs = now;
 }
 static inline void unblock_long_burst(struct sedf_edom_info* inf, s_time_t now) {
-	if (unlikely(inf->latency && (inf->period > inf->latency))) {
-		/*scale the slice and period accordingly to the latency hint*/
-		inf->period = inf->latency;
-		/*check for overflows on multiplication*/
-		ASSERT((inf->period < ULONG_MAX)
-		    && (inf->slice_orig < ULONG_MAX));
-		/*scale slice accordingly, so that utilisation stays the same*/
-		inf->slice = (inf->period * inf->slice_orig)
-		            / inf->period_orig;
-		inf->deadl_abs = now + inf->period;
-		inf->cputime = 0;
-	}
-	else {
-		/*we don't have a latency hint.. or we are currently in 
-		 "burst mode": use some other technique
-		  NB: this should be in fact the normal way of operation,
-		  when we are in sync with the device!*/
-		unblock_long_cons_b(inf, now);
-	}
-	inf->unblock_abs = now;
+    if (unlikely(inf->latency && (inf->period > inf->latency))) {
+        /*scale the slice and period accordingly to the latency hint*/
+        inf->period = inf->latency;
+        /*check for overflows on multiplication*/
+        ASSERT((inf->period < ULONG_MAX)
+               && (inf->slice_orig < ULONG_MAX));
+        /*scale slice accordingly, so that utilisation stays the same*/
+        inf->slice = (inf->period * inf->slice_orig)
+            / inf->period_orig;
+        inf->deadl_abs = now + inf->period;
+        inf->cputime = 0;
+    }
+    else {
+        /*we don't have a latency hint.. or we are currently in 
+          "burst mode": use some other technique
+          NB: this should be in fact the normal way of operation,
+          when we are in sync with the device!*/
+        unblock_long_cons_b(inf, now);
+    }
+    inf->unblock_abs = now;
 }
 
-#define DOMAIN_EDF 		1
-#define DOMAIN_EXTRA_PEN 	2
-#define DOMAIN_EXTRA_UTIL 	3
-#define DOMAIN_IDLE 		4
+#define DOMAIN_EDF   1
+#define DOMAIN_EXTRA_PEN  2
+#define DOMAIN_EXTRA_UTIL  3
+#define DOMAIN_IDLE   4
 static inline int get_run_type(struct exec_domain* d) {
-	struct sedf_edom_info* inf = EDOM_INFO(d);
-	if (is_idle_task(d->domain))
-		return DOMAIN_IDLE;
-	if (inf->status & EXTRA_RUN_PEN)
-		return DOMAIN_EXTRA_PEN;
-	if (inf->status & EXTRA_RUN_UTIL)
-		return DOMAIN_EXTRA_UTIL;
-	return DOMAIN_EDF;
+    struct sedf_edom_info* inf = EDOM_INFO(d);
+    if (is_idle_task(d->domain))
+        return DOMAIN_IDLE;
+    if (inf->status & EXTRA_RUN_PEN)
+        return DOMAIN_EXTRA_PEN;
+    if (inf->status & EXTRA_RUN_UTIL)
+        return DOMAIN_EXTRA_UTIL;
+    return DOMAIN_EDF;
 }
 /*Compares two domains in the relation of whether the one is allowed to
   interrupt the others execution.
   It returns true (!=0) if a switch to the other domain is good.
   Current Priority scheme is as follows:
-  	EDF > L0 (penalty based) extra-time > 
-  	L1 (utilization) extra-time > idle-domain
+   EDF > L0 (penalty based) extra-time > 
+   L1 (utilization) extra-time > idle-domain
   In the same class priorities are assigned as following:
-  	EDF: early deadline > late deadline
-  	L0 extra-time: lower score > higher score*/
+   EDF: early deadline > late deadline
+   L0 extra-time: lower score > higher score*/
 static inline int should_switch(struct exec_domain* cur,
-   struct exec_domain* other, s_time_t now) {
-	struct sedf_edom_info *cur_inf, *other_inf;
-	cur_inf   = EDOM_INFO(cur);
-	other_inf = EDOM_INFO(other);
-	
-	/*check whether we need to make an earlier sched-decision*/
-	if ((PERIOD_BEGIN(other_inf) < 
-	     schedule_data[other->processor].s_timer.expires))
-		return 1;
-	/*no timing-based switches need to be taken into account here*/
-	switch (get_run_type(cur)) {
-		case DOMAIN_EDF:
-			/* do not interrupt a running EDF domain */ 
-			return 0;
-		case DOMAIN_EXTRA_PEN:
-			/*check whether we also want 
-			  the L0 ex-q with lower score*/
-			if ((other_inf->status & EXTRA_WANT_PEN_Q)
-			&&  (other_inf->score[EXTRA_PEN_Q] < 
-			     cur_inf->score[EXTRA_PEN_Q]))
-				return 1;
-			else	return 0;
-		case DOMAIN_EXTRA_UTIL:
-			/*check whether we want the L0 extraq, don't
-			  switch if both domains want L1 extraq */
-			if (other_inf->status & EXTRA_WANT_PEN_Q)
-				return 1;
-			else	return 0;
-		case DOMAIN_IDLE:
-			return 1;
-	}
-	return 1;
+                                struct exec_domain* other, s_time_t now) {
+    struct sedf_edom_info *cur_inf, *other_inf;
+    cur_inf   = EDOM_INFO(cur);
+    other_inf = EDOM_INFO(other);
+ 
+ /*check whether we need to make an earlier sched-decision*/
+    if ((PERIOD_BEGIN(other_inf) < 
+         schedule_data[other->processor].s_timer.expires))
+        return 1;
+    /*no timing-based switches need to be taken into account here*/
+    switch (get_run_type(cur)) {
+    case DOMAIN_EDF:
+        /* do not interrupt a running EDF domain */ 
+        return 0;
+    case DOMAIN_EXTRA_PEN:
+        /*check whether we also want 
+          the L0 ex-q with lower score*/
+        if ((other_inf->status & EXTRA_WANT_PEN_Q)
+            &&  (other_inf->score[EXTRA_PEN_Q] < 
+                 cur_inf->score[EXTRA_PEN_Q]))
+            return 1;
+        else return 0;
+    case DOMAIN_EXTRA_UTIL:
+        /*check whether we want the L0 extraq, don't
+          switch if both domains want L1 extraq */
+        if (other_inf->status & EXTRA_WANT_PEN_Q)
+            return 1;
+        else return 0;
+    case DOMAIN_IDLE:
+        return 1;
+    }
+    return 1;
 }
 void sedf_wake(struct exec_domain *d) {
-	s_time_t              now = NOW();
-	struct sedf_edom_info* inf = EDOM_INFO(d);
-	
-	PRINT(3, "sedf_wake was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id);
-	
-	if (unlikely(is_idle_task(d->domain)))
-		return;
-			
-	if ( unlikely(__task_on_queue(d)) ) {
-		PRINT(3,"\tdomain %i.%i is already in some queue\n",
-		      d->domain->domain_id, d->vcpu_id);
-		return;
-	}
-	ASSERT(!sedf_runnable(d));
-	inf->status &= ~SEDF_ASLEEP;
-	ASSERT(!extraq_on(d, EXTRA_UTIL_Q));
-	ASSERT(!extraq_on(d, EXTRA_PEN_Q));
-	
-	if (unlikely(inf->deadl_abs == 0))
-		/*initial setup of the deadline*/
-		inf->deadl_abs = now + inf->slice;
-		
-	PRINT(3,"waking up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\
-	        "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs,
-		 inf->period, now);
-#ifdef SEDF_STATS	
-	inf->block_tot++;
+    s_time_t              now = NOW();
+    struct sedf_edom_info* inf = EDOM_INFO(d);
+ 
+    PRINT(3, "sedf_wake was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id);
+ 
+    if (unlikely(is_idle_task(d->domain)))
+        return;
+   
+    if ( unlikely(__task_on_queue(d)) ) {
+        PRINT(3,"\tdomain %i.%i is already in some queue\n",
+              d->domain->domain_id, d->vcpu_id);
+        return;
+    }
+    ASSERT(!sedf_runnable(d));
+    inf->status &= ~SEDF_ASLEEP;
+    ASSERT(!extraq_on(d, EXTRA_UTIL_Q));
+    ASSERT(!extraq_on(d, EXTRA_PEN_Q));
+ 
+    if (unlikely(inf->deadl_abs == 0))
+        /*initial setup of the deadline*/
+        inf->deadl_abs = now + inf->slice;
+  
+    PRINT(3,"waking up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\
+          "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs,
+          inf->period, now);
+#ifdef SEDF_STATS 
+    inf->block_tot++;
+#endif
+    if (unlikely(now < PERIOD_BEGIN(inf))) {
+        PRINT(4,"extratime unblock\n");
+        /* unblocking in extra-time! */
+#if (EXTRA == EXTRA_BLOCK_WEIGHT)
+        if (inf->status & EXTRA_WANT_PEN_Q) {
+            /*we have a domain that wants compensation
+              for block penalty and did just block in
+              its compensation time. Give it another
+              chance!*/
+            extraq_add_sort_update(d, EXTRA_PEN_Q, 0);
+        }
 #endif
-	if (unlikely(now < PERIOD_BEGIN(inf))) {
-		PRINT(4,"extratime unblock\n");
-		/* unblocking in extra-time! */
-		#if (EXTRA == EXTRA_BLOCK_WEIGHT)
-		if (inf->status & EXTRA_WANT_PEN_Q) {
-			/*we have a domain that wants compensation
-			  for block penalty and did just block in
-			  its compensation time. Give it another
-			  chance!*/
-			extraq_add_sort_update(d, EXTRA_PEN_Q, 0);
-		}
-		#endif
-		extraq_check_add_unblocked(d, 0);
-	}		
-	else {		
-		if (now < inf->deadl_abs) {
-			PRINT(4,"short unblocking\n");
-			/*short blocking*/
+        extraq_check_add_unblocked(d, 0);
+    }  
+    else {  
+        if (now < inf->deadl_abs) {
+            PRINT(4,"short unblocking\n");
+            /*short blocking*/
 #ifdef SEDF_STATS
-			inf->short_block_tot++;
+            inf->short_block_tot++;
+#endif
+#if (UNBLOCK <= UNBLOCK_ATROPOS)
+            unblock_short_vcons(inf, now);
+#elif (UNBLOCK == UNBLOCK_SHORT_RESUME)
+            unblock_short_cons(inf, now);
+#elif (UNBLOCK == UNBLOCK_BURST)
+            unblock_short_burst(inf, now);
+#elif (UNBLOCK == UNBLOCK_EXTRA_SUPPORT)
+            unblock_short_extra_support(inf, now);
 #endif
-			#if (UNBLOCK <= UNBLOCK_ATROPOS)
-			unblock_short_vcons(inf, now);
-			#elif (UNBLOCK == UNBLOCK_SHORT_RESUME)
-			unblock_short_cons(inf, now);
-			#elif (UNBLOCK == UNBLOCK_BURST)
-			unblock_short_burst(inf, now);
-			#elif (UNBLOCK == UNBLOCK_EXTRA_SUPPORT)
-			unblock_short_extra_support(inf, now);
-			#endif
 
-			extraq_check_add_unblocked(d, 1);
-		}
-		else {
-			PRINT(4,"long unblocking\n");
-			/*long unblocking*/
+            extraq_check_add_unblocked(d, 1);
+        }
+        else {
+            PRINT(4,"long unblocking\n");
+            /*long unblocking*/
 #ifdef SEDF_STATS
-			inf->long_block_tot++;
+            inf->long_block_tot++;
+#endif
+#if (UNBLOCK == UNBLOCK_ISOCHRONOUS_EDF)
+            unblock_long_vcons(inf, now);
+#elif (UNBLOCK == UNBLOCK_EDF \
+       || UNBLOCK == UNBLOCK_EXTRA_SUPPORT)
+            unblock_long_cons_b(inf, now);
+#elif (UNBLOCK == UNBLOCK_ATROPOS)
+            unblock_long_cons_c(inf, now);
+#elif (UNBLOCK == UNBLOCK_SHORT_RESUME)
+            unblock_long_cons_b(inf, now);
+            /*unblock_short_cons_c(inf, now);*/
+#elif (UNBLOCK == UNBLOCK_BURST)
+            unblock_long_burst(inf, now);
 #endif
-			#if (UNBLOCK == UNBLOCK_ISOCHRONOUS_EDF)
-			unblock_long_vcons(inf, now);
-			#elif (UNBLOCK == UNBLOCK_EDF \
-			    || UNBLOCK == UNBLOCK_EXTRA_SUPPORT)
-			unblock_long_cons_b(inf, now);
-			#elif (UNBLOCK == UNBLOCK_ATROPOS)
-			unblock_long_cons_c(inf, now);
-			#elif (UNBLOCK == UNBLOCK_SHORT_RESUME)
-			unblock_long_cons_b(inf, now);
-			/*unblock_short_cons_c(inf, now);*/
-			#elif (UNBLOCK == UNBLOCK_BURST)
-			unblock_long_burst(inf, now);
-			#endif
 
-			extraq_check_add_unblocked(d, 1);
-		}
-	}
-	PRINT(3,"woke up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\
-	        "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs,
-		inf->period, now);
-	if (PERIOD_BEGIN(inf) > now) {
-		__add_to_waitqueue_sort(d);
-		PRINT(3,"added to waitq\n");
-	}
-	else {
-		__add_to_runqueue_sort(d);
-		PRINT(3,"added to runq\n");
-	}
-	
+            extraq_check_add_unblocked(d, 1);
+        }
+    }
+    PRINT(3,"woke up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\
+          "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs,
+          inf->period, now);
+    if (PERIOD_BEGIN(inf) > now) {
+        __add_to_waitqueue_sort(d);
+        PRINT(3,"added to waitq\n");
+    }
+    else {
+        __add_to_runqueue_sort(d);
+        PRINT(3,"added to runq\n");
+    }
+ 
 #ifdef SEDF_STATS
-	/*do some statistics here...*/
-	if (inf->block_abs != 0) {
-		inf->block_time_tot += now - inf->block_abs;
-		inf->penalty_time_tot +=
-		   PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs;
-	}
-#endif
-	/*sanity check: make sure each extra-aware domain IS on the util-q!*/
-	ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q)));
-	ASSERT(__task_on_queue(d));
-	/*check whether the awakened task needs to invoke the do_schedule
-	  routine. Try to avoid unnecessary runs but:
-	  Save approximation: Always switch to scheduler!*/
-	if (should_switch(schedule_data[d->processor].curr, d, now)){
-#ifdef ADV_SCHED_HISTO
-		adv_sched_hist_start(d->processor);
+    /*do some statistics here...*/
+    if (inf->block_abs != 0) {
+        inf->block_time_tot += now - inf->block_abs;
+        inf->penalty_time_tot +=
+            PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs;
+    }
 #endif
-		cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
-	}
+    /*sanity check: make sure each extra-aware domain IS on the util-q!*/
+    ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q)));
+    ASSERT(__task_on_queue(d));
+    /*check whether the awakened task needs to invoke the do_schedule
+      routine. Try to avoid unnecessary runs but:
+      Safe approximation: Always switch to scheduler!*/
+    if (should_switch(schedule_data[d->processor].curr, d, now))
+        cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ);
 }
 
 /*Print a lot of use-{full, less} information about a domains in the system*/
 static void sedf_dump_domain(struct exec_domain *d) {
-	printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id,
-		test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F');
-	printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64" sc=%i xtr(%s)=%"PRIu64" ew=%hu",
-	  EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs,
-	  EDOM_INFO(d)->weight, d->cpu_time, EDOM_INFO(d)->score[EXTRA_UTIL_Q],
-	 (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no",
-	  EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight);
-	if (d->cpu_time !=0)
-		printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100)
-		                 / d->cpu_time);
+    printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id,
+           test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F');
+    printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64" sc=%i xtr(%s)=%"PRIu64" ew=%hu",
+           EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs,
+           EDOM_INFO(d)->weight, d->cpu_time, EDOM_INFO(d)->score[EXTRA_UTIL_Q],
+           (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no",
+           EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight);
+    if (d->cpu_time !=0)
+        printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100)
+               / d->cpu_time);
 #ifdef SEDF_STATS
-	if (EDOM_INFO(d)->block_time_tot!=0)
-		printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) /
-		                     EDOM_INFO(d)->block_time_tot);
-	if (EDOM_INFO(d)->block_tot!=0)
-		printf("\n   blks=%u sh=%u (%u%%) (shc=%u (%u%%) shex=%i "\
-		       "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"",
-		    EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot,
-		   (EDOM_INFO(d)->short_block_tot * 100) 
-		  / EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_cont,
-		   (EDOM_INFO(d)->short_cont * 100) / EDOM_INFO(d)->block_tot,
-		    EDOM_INFO(d)->pen_extra_blocks,
-		    EDOM_INFO(d)->pen_extra_slices,
-		    EDOM_INFO(d)->long_block_tot,
-		   (EDOM_INFO(d)->long_block_tot * 100) / EDOM_INFO(d)->block_tot,
-		   (EDOM_INFO(d)->block_time_tot) / EDOM_INFO(d)->block_tot,
-		   (EDOM_INFO(d)->penalty_time_tot) / EDOM_INFO(d)->block_tot);
+    if (EDOM_INFO(d)->block_time_tot!=0)
+        printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) /
+               EDOM_INFO(d)->block_time_tot);
+    if (EDOM_INFO(d)->block_tot!=0)
+        printf("\n   blks=%u sh=%u (%u%%) (shc=%u (%u%%) shex=%i "\
+               "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"",
+               EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot,
+               (EDOM_INFO(d)->short_block_tot * 100) 
+               / EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_cont,
+               (EDOM_INFO(d)->short_cont * 100) / EDOM_INFO(d)->block_tot,
+               EDOM_INFO(d)->pen_extra_blocks,
+               EDOM_INFO(d)->pen_extra_slices,
+               EDOM_INFO(d)->long_block_tot,
+               (EDOM_INFO(d)->long_block_tot * 100) / EDOM_INFO(d)->block_tot,
+               (EDOM_INFO(d)->block_time_tot) / EDOM_INFO(d)->block_tot,
+               (EDOM_INFO(d)->penalty_time_tot) / EDOM_INFO(d)->block_tot);
 #endif
-	printf("\n");
+    printf("\n");
 }
 
 /*dumps all domains on hte specified cpu*/
 static void sedf_dump_cpu_state(int i)
 {
-	struct list_head      *list, *queue, *tmp;
-	struct sedf_edom_info *d_inf;
-	struct domain         *d;
-	struct exec_domain    *ed;
-	int loop = 0;
-	
-	printk("now=%"PRIu64"\n",NOW());
-	queue = RUNQ(i);
-	printk("RUNQ rq %lx   n: %lx, p: %lx\n",  (unsigned long)queue,
-		(unsigned long) queue->next, (unsigned long) queue->prev);
-	list_for_each_safe ( list, tmp, queue ) {
-		printk("%3d: ",loop++);
-		d_inf = list_entry(list, struct sedf_edom_info, list);
-		sedf_dump_domain(d_inf->exec_domain);
-	}
-	
-	queue = WAITQ(i); loop = 0;
-	printk("\nWAITQ rq %lx   n: %lx, p: %lx\n",  (unsigned long)queue,
-		(unsigned long) queue->next, (unsigned long) queue->prev);
-	list_for_each_safe ( list, tmp, queue ) {
-		printk("%3d: ",loop++);
-		d_inf = list_entry(list, struct sedf_edom_info, list);
-		sedf_dump_domain(d_inf->exec_domain);
-	}
-	
-	queue = EXTRAQ(i,EXTRA_PEN_Q); loop = 0;
-	printk("\nEXTRAQ (penalty) rq %lx   n: %lx, p: %lx\n",
-	       (unsigned long)queue, (unsigned long) queue->next,
-	       (unsigned long) queue->prev);
-	list_for_each_safe ( list, tmp, queue ) {
-		d_inf = list_entry(list, struct sedf_edom_info,
-		                   extralist[EXTRA_PEN_Q]);
-		printk("%3d: ",loop++);
-		sedf_dump_domain(d_inf->exec_domain);
-	}
-	
-	queue = EXTRAQ(i,EXTRA_UTIL_Q); loop = 0;
-	printk("\nEXTRAQ (utilization) rq %lx   n: %lx, p: %lx\n",
-	       (unsigned long)queue, (unsigned long) queue->next,
-	       (unsigned long) queue->prev);
-	list_for_each_safe ( list, tmp, queue )	{
-		d_inf = list_entry(list, struct sedf_edom_info,
-		                   extralist[EXTRA_UTIL_Q]);
-		printk("%3d: ",loop++);
-		sedf_dump_domain(d_inf->exec_domain);
-	}
-	
-	loop = 0;
-	printk("\nnot on Q\n");
-	for_each_domain(d)
-		for_each_exec_domain(d, ed)
-		{
-			if (!__task_on_queue(ed) && (ed->processor == i)) {
-				printk("%3d: ",loop++);
-				sedf_dump_domain(ed);
-			}
-		}
+    struct list_head      *list, *queue, *tmp;
+    struct sedf_edom_info *d_inf;
+    struct domain         *d;
+    struct exec_domain    *ed;
+    int loop = 0;
+ 
+    printk("now=%"PRIu64"\n",NOW());
+    queue = RUNQ(i);
+    printk("RUNQ rq %lx   n: %lx, p: %lx\n",  (unsigned long)queue,
+           (unsigned long) queue->next, (unsigned long) queue->prev);
+    list_for_each_safe ( list, tmp, queue ) {
+        printk("%3d: ",loop++);
+        d_inf = list_entry(list, struct sedf_edom_info, list);
+        sedf_dump_domain(d_inf->exec_domain);
+    }
+ 
+    queue = WAITQ(i); loop = 0;
+    printk("\nWAITQ rq %lx   n: %lx, p: %lx\n",  (unsigned long)queue,
+           (unsigned long) queue->next, (unsigned long) queue->prev);
+    list_for_each_safe ( list, tmp, queue ) {
+        printk("%3d: ",loop++);
+        d_inf = list_entry(list, struct sedf_edom_info, list);
+        sedf_dump_domain(d_inf->exec_domain);
+    }
+ 
+    queue = EXTRAQ(i,EXTRA_PEN_Q); loop = 0;
+    printk("\nEXTRAQ (penalty) rq %lx   n: %lx, p: %lx\n",
+           (unsigned long)queue, (unsigned long) queue->next,
+           (unsigned long) queue->prev);
+    list_for_each_safe ( list, tmp, queue ) {
+        d_inf = list_entry(list, struct sedf_edom_info,
+                           extralist[EXTRA_PEN_Q]);
+        printk("%3d: ",loop++);
+        sedf_dump_domain(d_inf->exec_domain);
+    }
+ 
+    queue = EXTRAQ(i,EXTRA_UTIL_Q); loop = 0;
+    printk("\nEXTRAQ (utilization) rq %lx   n: %lx, p: %lx\n",
+           (unsigned long)queue, (unsigned long) queue->next,
+           (unsigned long) queue->prev);
+    list_for_each_safe ( list, tmp, queue ) {
+        d_inf = list_entry(list, struct sedf_edom_info,
+                           extralist[EXTRA_UTIL_Q]);
+        printk("%3d: ",loop++);
+        sedf_dump_domain(d_inf->exec_domain);
+    }
+ 
+    loop = 0;
+    printk("\nnot on Q\n");
+    for_each_domain(d)
+        for_each_exec_domain(d, ed)
+    {
+        if (!__task_on_queue(ed) && (ed->processor == i)) {
+            printk("%3d: ",loop++);
+            sedf_dump_domain(ed);
+        }
+    }
 }
 /*Adjusts periods and slices of the domains accordingly to their weights*/
 static inline int sedf_adjust_weights(struct sched_adjdom_cmd *cmd) {
-	struct exec_domain *p;
-	struct domain      *d;
-	int                 sumw[NR_CPUS];
-	s_time_t            sumt[NR_CPUS];
-	int                 cpu;
-	
-	for (cpu=0; cpu < NR_CPUS; cpu++) {
-		sumw[cpu] = 0;
-		sumt[cpu] = 0;
-	}
-	/*sum up all weights*/
-	for_each_domain(d)
-	  for_each_exec_domain(d, p) {
-		if (EDOM_INFO(p)->weight)
-			sumw[p->processor] += EDOM_INFO(p)->weight;
-		else {
-			/*don't modify domains who don't have a weight, but sum
-			  up the time they need, projected to a WEIGHT_PERIOD,
-			  so that this time is not given to the weight-driven
-			  domains*/
-			/*check for overflows*/
-			ASSERT((WEIGHT_PERIOD < ULONG_MAX) 
-			    && (EDOM_INFO(p)->slice_orig < ULONG_MAX));
-			sumt[p->processor] += (WEIGHT_PERIOD *
-			    EDOM_INFO(p)->slice_orig) / EDOM_INFO(p)->period_orig;
-		}
-	}
-	/*adjust all slices (and periods) to the new weight*/
-	for_each_domain(d) 
-	  for_each_exec_domain(d, p) {
-		if (EDOM_INFO(p)->weight) {
-			EDOM_INFO(p)->period_orig = 
-			     EDOM_INFO(p)->period = WEIGHT_PERIOD;
-			EDOM_INFO(p)->slice_orig  =
-			      EDOM_INFO(p)->slice = (EDOM_INFO(p)->weight *
-			      (WEIGHT_PERIOD -WEIGHT_SAFETY -
-			       sumt[p->processor])) / sumw[p->processor];
-		}
-	}
-	return 0;
+    struct exec_domain *p;
+    struct domain      *d;
+    int                 sumw[NR_CPUS];
+    s_time_t            sumt[NR_CPUS];
+    int                 cpu;
+ 
+    for (cpu=0; cpu < NR_CPUS; cpu++) {
+        sumw[cpu] = 0;
+        sumt[cpu] = 0;
+    }
+    /*sum up all weights*/
+    for_each_domain(d)
+        for_each_exec_domain(d, p) {
+        if (EDOM_INFO(p)->weight)
+            sumw[p->processor] += EDOM_INFO(p)->weight;
+        else {
+            /*don't modify domains who don't have a weight, but sum
+              up the time they need, projected to a WEIGHT_PERIOD,
+              so that this time is not given to the weight-driven
+              domains*/
+            /*check for overflows*/
+            ASSERT((WEIGHT_PERIOD < ULONG_MAX) 
+                   && (EDOM_INFO(p)->slice_orig < ULONG_MAX));
+            sumt[p->processor] += 
+                (WEIGHT_PERIOD * EDOM_INFO(p)->slice_orig) / 
+                EDOM_INFO(p)->period_orig;
+        }
+    }
+    /*adjust all slices (and periods) to the new weight*/
+    for_each_domain(d) 
+        for_each_exec_domain(d, p) {
+        if (EDOM_INFO(p)->weight) {
+            EDOM_INFO(p)->period_orig = 
+                EDOM_INFO(p)->period  = WEIGHT_PERIOD;
+            EDOM_INFO(p)->slice_orig  =
+                EDOM_INFO(p)->slice   = 
+                (EDOM_INFO(p)->weight *
+                 (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[p->processor])) / 
+                sumw[p->processor];
+        }
+    }
+    return 0;
 }
 
 /* set or fetch domain scheduling parameters */
 static int sedf_adjdom(struct domain *p, struct sched_adjdom_cmd *cmd) {
-	struct exec_domain *ed;
+    struct exec_domain *ed;
 
-	PRINT(2,"sedf_adjdom was called, domain-id %i new period %"PRIu64" "\
-	        "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n",
-		p->domain_id, cmd->u.sedf.period, cmd->u.sedf.slice,
-		cmd->u.sedf.latency, (cmd->u.sedf.extratime)?"yes":"no");
-	if ( cmd->direction == SCHED_INFO_PUT )
-	{
-		/*check for sane parameters*/
-		if (!cmd->u.sedf.period && !cmd->u.sedf.weight)
-			return -EINVAL;
-		if (cmd->u.sedf.weight) {
-			if ((cmd->u.sedf.extratime & EXTRA_AWARE) &&
-			    (! cmd->u.sedf.period)) {
-			/*weight driven domains with xtime ONLY!*/
-				for_each_exec_domain(p, ed) {
-				  EDOM_INFO(ed)->extraweight = cmd->u.sedf.weight;
-				  EDOM_INFO(ed)->weight = 0;
-				  EDOM_INFO(ed)->slice = 0;
-				  EDOM_INFO(ed)->period = WEIGHT_PERIOD;
-				}
-			} else {
-			/*weight driven domains with real-time execution*/
-				for_each_exec_domain(p, ed)
-				  EDOM_INFO(ed)->weight = cmd->u.sedf.weight;
-			}
-		}
-		else {
-			/*time driven domains*/
-			for_each_exec_domain(p, ed) {
-				/* sanity checking! */
-				if(cmd->u.sedf.slice > cmd->u.sedf.period )
-					return -EINVAL;
-				EDOM_INFO(ed)->weight = 0;
-				EDOM_INFO(ed)->extraweight = 0;
-				EDOM_INFO(ed)->period_orig = 
-				EDOM_INFO(ed)->period   = cmd->u.sedf.period;
-				EDOM_INFO(ed)->slice_orig  = 
-				EDOM_INFO(ed)->slice    = cmd->u.sedf.slice;
-			}
-		}
-		if (sedf_adjust_weights(cmd))
-			return -EINVAL;
-			
-		for_each_exec_domain(p, ed) {
-			EDOM_INFO(ed)->status  = (EDOM_INFO(ed)->status &
-			  ~EXTRA_AWARE) | (cmd->u.sedf.extratime & EXTRA_AWARE);
-			EDOM_INFO(ed)->latency = cmd->u.sedf.latency;
-			extraq_check(ed);
-		}
-	}
-	else if ( cmd->direction == SCHED_INFO_GET )
-	{
-		cmd->u.sedf.period    = EDOM_INFO(p->exec_domain[0])->period;
-		cmd->u.sedf.slice     = EDOM_INFO(p->exec_domain[0])->slice;
-		cmd->u.sedf.extratime = EDOM_INFO(p->exec_domain[0])->status
-		                            & EXTRA_AWARE;
-		cmd->u.sedf.latency   = EDOM_INFO(p->exec_domain[0])->latency;
-		cmd->u.sedf.weight    = EDOM_INFO(p->exec_domain[0])->weight;
-	}
-	PRINT(2,"sedf_adjdom_finished\n");
-	return 0;
+    PRINT(2,"sedf_adjdom was called, domain-id %i new period %"PRIu64" "\
+          "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n",
+          p->domain_id, cmd->u.sedf.period, cmd->u.sedf.slice,
+          cmd->u.sedf.latency, (cmd->u.sedf.extratime)?"yes":"no");
+    if ( cmd->direction == SCHED_INFO_PUT )
+    {
+        /*check for sane parameters*/
+        if (!cmd->u.sedf.period && !cmd->u.sedf.weight)
+            return -EINVAL;
+        if (cmd->u.sedf.weight) {
+            if ((cmd->u.sedf.extratime & EXTRA_AWARE) &&
+                (! cmd->u.sedf.period)) {
+                /*weight driven domains with xtime ONLY!*/
+                for_each_exec_domain(p, ed) {
+                    EDOM_INFO(ed)->extraweight = cmd->u.sedf.weight;
+                    EDOM_INFO(ed)->weight = 0;
+                    EDOM_INFO(ed)->slice = 0;
+                    EDOM_INFO(ed)->period = WEIGHT_PERIOD;
+                }
+            } else {
+                /*weight driven domains with real-time execution*/
+                for_each_exec_domain(p, ed)
+                    EDOM_INFO(ed)->weight = cmd->u.sedf.weight;
+            }
+        }
+        else {
+            /*time driven domains*/
+            for_each_exec_domain(p, ed) {
+                /* sanity checking! */
+                if(cmd->u.sedf.slice > cmd->u.sedf.period )
+                    return -EINVAL;
+                EDOM_INFO(ed)->weight = 0;
+                EDOM_INFO(ed)->extraweight = 0;
+                EDOM_INFO(ed)->period_orig = 
+                    EDOM_INFO(ed)->period   = cmd->u.sedf.period;
+                EDOM_INFO(ed)->slice_orig  = 
+                    EDOM_INFO(ed)->slice    = cmd->u.sedf.slice;
+            }
+        }
+        if (sedf_adjust_weights(cmd))
+            return -EINVAL;
+   
+        for_each_exec_domain(p, ed) {
+            EDOM_INFO(ed)->status  = 
+                (EDOM_INFO(ed)->status &
+                 ~EXTRA_AWARE) | (cmd->u.sedf.extratime & EXTRA_AWARE);
+            EDOM_INFO(ed)->latency = cmd->u.sedf.latency;
+            extraq_check(ed);
+        }
+    }
+    else if ( cmd->direction == SCHED_INFO_GET )
+    {
+        cmd->u.sedf.period    = EDOM_INFO(p->exec_domain[0])->period;
+        cmd->u.sedf.slice     = EDOM_INFO(p->exec_domain[0])->slice;
+        cmd->u.sedf.extratime = EDOM_INFO(p->exec_domain[0])->status
+            & EXTRA_AWARE;
+        cmd->u.sedf.latency   = EDOM_INFO(p->exec_domain[0])->latency;
+        cmd->u.sedf.weight    = EDOM_INFO(p->exec_domain[0])->weight;
+    }
+    PRINT(2,"sedf_adjdom_finished\n");
+    return 0;
 }
 
 struct scheduler sched_sedf_def = {
@@ -1464,11 +1439,9 @@ struct scheduler sched_sedf_def = {
     .opt_name = "sedf",
     .sched_id = SCHED_SEDF,
     
-    .init_idle_task = sedf_init_idle_task,
     .alloc_task     = sedf_alloc_task,
     .add_task       = sedf_add_task,
     .free_task      = sedf_free_task,
-    .init_scheduler = sedf_init_scheduler,
     .do_schedule    = sedf_do_schedule,
     .dump_cpu_state = sedf_dump_cpu_state,
     .sleep          = sedf_sleep,
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index d7ba0a078c..ec974657e2 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -41,11 +41,6 @@
 static char opt_sched[10] = "bvt";
 string_param("sched", opt_sched);
 
-/*#define WAKE_HISTO*/
-/*#define BLOCKTIME_HISTO*/
-/*#define ADV_SCHED_HISTO*/
-//#include <xen/adv_sched_hist.h>
-
 #if defined(WAKE_HISTO)
 #define BUCKETS 31
 #elif defined(BLOCKTIME_HISTO)
@@ -93,8 +88,8 @@ void free_domain_struct(struct domain *d)
     xfree(d);
 }
 
-struct exec_domain *alloc_exec_domain_struct(struct domain *d,
-                                             unsigned long vcpu)
+struct exec_domain *alloc_exec_domain_struct(
+    struct domain *d, unsigned long vcpu)
 {
     struct exec_domain *ed, *edc;
 
@@ -126,10 +121,10 @@ struct exec_domain *alloc_exec_domain_struct(struct domain *d,
         edc->next_in_list = ed;
 
         if (test_bit(_VCPUF_cpu_pinned, &edc->vcpu_flags)) {
-            ed->processor = (edc->processor + 1) % smp_num_cpus;
+            ed->processor = (edc->processor + 1) % num_online_cpus();
             set_bit(_VCPUF_cpu_pinned, &ed->vcpu_flags);
         } else {
-            ed->processor = (edc->processor + 1) % smp_num_cpus;  /* XXX */
+            ed->processor = (edc->processor + 1) % num_online_cpus();
         }
     }
 
@@ -168,20 +163,22 @@ void sched_add_domain(struct exec_domain *ed)
 {
     struct domain *d = ed->domain;
 
-    /* Must be unpaused by control software to start execution. */
-    set_bit(_VCPUF_ctrl_pause, &ed->vcpu_flags);
+    /* Initialise the per-domain timer. */
+    init_ac_timer(&ed->timer);
+    ed->timer.cpu      = ed->processor;
+    ed->timer.data     = (unsigned long)ed;
+    ed->timer.function = &dom_timer_fn;
 
-    if ( d->domain_id != IDLE_DOMAIN_ID )
+    if ( is_idle_task(d) )
     {
-        /* Initialise the per-domain timer. */
-        init_ac_timer(&ed->timer);
-        ed->timer.cpu      = ed->processor;
-        ed->timer.data     = (unsigned long)ed;
-        ed->timer.function = &dom_timer_fn;
+        schedule_data[ed->processor].curr = ed;
+        schedule_data[ed->processor].idle = ed;
+        set_bit(_VCPUF_running, &ed->vcpu_flags);
     }
     else
     {
-        schedule_data[ed->processor].idle = ed;
+        /* Must be unpaused by control software to start execution. */
+        set_bit(_VCPUF_ctrl_pause, &ed->vcpu_flags);
     }
 
     SCHED_OP(add_task, ed);
@@ -195,12 +192,6 @@ void sched_rem_domain(struct exec_domain *ed)
     TRACE_2D(TRC_SCHED_DOM_REM, ed->domain->domain_id, ed->vcpu_id);
 }
 
-void init_idle_task(void)
-{
-    if ( SCHED_OP(init_idle_task, current) < 0 )
-        BUG();
-}
-
 void domain_sleep(struct exec_domain *ed)
 {
     unsigned long flags;
@@ -240,10 +231,6 @@ long do_block(void)
 {
     struct exec_domain *ed = current;
 
-#ifdef ADV_SCHED_HISTO
-    adv_sched_hist_start(current->processor);
-#endif
-
     ed->vcpu_info->evtchn_upcall_mask = 0;
     set_bit(_VCPUF_blocked, &ed->vcpu_flags);
 
@@ -264,10 +251,6 @@ long do_block(void)
 /* Voluntarily yield the processor for this allocation. */
 static long do_yield(void)
 {
-#ifdef ADV_SCHED_HISTO
-    adv_sched_hist_start(current->processor);
-#endif
-    
     TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
     __enter_scheduler();
     return 0;
@@ -422,13 +405,7 @@ static void __enter_scheduler(void)
     
     spin_lock_irq(&schedule_data[cpu].schedule_lock);
 
-#ifdef ADV_SCHED_HISTO
-    adv_sched_hist_from_stop(cpu);
-#endif
     now = NOW();
-#ifdef ADV_SCHED_HISTO
-    adv_sched_hist_start(cpu);
-#endif
 
     rem_ac_timer(&schedule_data[cpu].s_timer);
     
@@ -447,7 +424,7 @@ static void __enter_scheduler(void)
     next->lastschd = now;
 
     /* reprogramm the timer */
-    schedule_data[cpu].s_timer.expires  = now + r_time;
+    schedule_data[cpu].s_timer.expires = now + r_time;
     add_ac_timer(&schedule_data[cpu].s_timer);
 
     /* Must be protected by the schedule_lock! */
@@ -455,12 +432,9 @@ static void __enter_scheduler(void)
 
     spin_unlock_irq(&schedule_data[cpu].schedule_lock);
 
-    if ( unlikely(prev == next) ) {
-#ifdef ADV_SCHED_HISTO
-        adv_sched_hist_to_stop(cpu);
-#endif
+    if ( unlikely(prev == next) )
         return continue_running(prev);
-    }
+
     perfc_incrc(sched_ctx);
 
 #if defined(WAKE_HISTO)
@@ -495,10 +469,6 @@ static void __enter_scheduler(void)
              prev->domain->domain_id, prev->vcpu_id,
              next->domain->domain_id, next->vcpu_id);
 
-#ifdef ADV_SCHED_HISTO
-    adv_sched_hist_to_stop(cpu);
-#endif
-
     context_switch(prev, next);
 }
 
@@ -520,10 +490,6 @@ int idle_cpu(int cpu)
 /* The scheduler timer: force a run through the scheduler */
 static void s_timer_fn(unsigned long unused)
 {
-#ifdef ADV_SCHED_HISTO
-    adv_sched_hist_start(current->processor);
-#endif
-
     raise_softirq(SCHEDULE_SOFTIRQ);
     perfc_incrc(sched_irq);
 }
@@ -567,8 +533,7 @@ void __init scheduler_init(void)
     for ( i = 0; i < NR_CPUS; i++ )
     {
         spin_lock_init(&schedule_data[i].schedule_lock);
-        schedule_data[i].curr = &idle0_exec_domain;
-        
+
         init_ac_timer(&schedule_data[i].s_timer);
         schedule_data[i].s_timer.cpu      = i;
         schedule_data[i].s_timer.data     = 2;
@@ -580,7 +545,8 @@ void __init scheduler_init(void)
         t_timer[i].function = &t_timer_fn;
     }
 
-    schedule_data[0].idle = &idle0_exec_domain;
+    schedule_data[0].curr = idle_task[0];
+    schedule_data[0].idle = idle_task[0];
 
     for ( i = 0; schedulers[i] != NULL; i++ )
     {
@@ -594,8 +560,8 @@ void __init scheduler_init(void)
 
     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
 
-    if ( SCHED_OP(init_scheduler) < 0 )
-        panic("Initialising scheduler failed!");
+    BUG_ON(SCHED_OP(alloc_task, idle_task[0]) < 0);
+    sched_add_domain(idle_task[0]);
 }
 
 /*
@@ -604,14 +570,10 @@ void __init scheduler_init(void)
  */
 void schedulers_start(void) 
 {   
-    s_timer_fn(0);
-    smp_call_function((void *)s_timer_fn, NULL, 1, 1);
-
     t_timer_fn(0);
     smp_call_function((void *)t_timer_fn, NULL, 1, 1);
 }
 
-
 void dump_runq(unsigned char key)
 {
     s_time_t      now = NOW();
@@ -624,7 +586,7 @@ void dump_runq(unsigned char key)
     SCHED_OP(dump_settings);
     printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now); 
 
-    for ( i = 0; i < smp_num_cpus; i++ )
+    for_each_online_cpu ( i )
     {
         spin_lock(&schedule_data[i].schedule_lock);
         printk("CPU[%02d] ", i);
@@ -636,10 +598,11 @@ void dump_runq(unsigned char key)
 }
 
 #if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO)
+
 void print_sched_histo(unsigned char key)
 {
     int i, j, k;
-    for ( k = 0; k < smp_num_cpus; k++ )
+    for_each_online_cpu ( k )
     {
         j = 0;
         printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k);
@@ -659,73 +622,20 @@ void print_sched_histo(unsigned char key)
     }
       
 }
+
 void reset_sched_histo(unsigned char key)
 {
     int i, j;
-    for ( j = 0; j < smp_num_cpus; j++ )
+    for ( j = 0; j < NR_CPUS; j++ )
         for ( i=0; i < BUCKETS; i++ ) 
             schedule_data[j].hist[i] = 0;
 }
+
 #else
-#if defined(ADV_SCHED_HISTO)
-void print_sched_histo(unsigned char key)
-{
-    int i, j, k,t;
-    printf("Hello!\n");
-    for ( k = 0; k < smp_num_cpus; k++ )
-    {
-        j = 0;
-	t = 0;
-        printf ("CPU[%02d]: scheduler latency histogram FROM (ms:[count])\n", k);
-        for ( i = 0; i < BUCKETS; i++ )
-        {
-            //if ( schedule_data[k].hist[i] != 0 )
-            {
-	        t += schedule_data[k].from_hist[i];
-                if ( i < BUCKETS-1 )
-                    printk("%3d:[%7u]    ", i, schedule_data[k].from_hist[i]);
-                else
-                    printk(" >:[%7u]    ", schedule_data[k].from_hist[i]);
-                //if ( !(++j % 5) )
-                    printk("\n");
-            }
-        }
-        printk("\nTotal: %i\n",t);
-    }
-    for ( k = 0; k < smp_num_cpus; k++ )
-    {
-        j = 0; t = 0;
-        printf ("CPU[%02d]: scheduler latency histogram TO (ms:[count])\n", k);
-        for ( i = 0; i < BUCKETS; i++ )
-        {
-            //if ( schedule_data[k].hist[i] != 0 )
-            {
-	    	t += schedule_data[k].from_hist[i];
-                if ( i < BUCKETS-1 )
-                    printk("%3d:[%7u]    ", i, schedule_data[k].to_hist[i]);
-                else
-                    printk(" >:[%7u]    ", schedule_data[k].to_hist[i]);
-                //if ( !(++j % 5) )
-                    printk("\n");
-            }
-        }
-	printk("\nTotal: %i\n",t);
-    }
-      
-}
-void reset_sched_histo(unsigned char key)
-{
-    int i, j;
-    for ( j = 0; j < smp_num_cpus; j++ ) {
-        for ( i=0; i < BUCKETS; i++ ) 
-            schedule_data[j].to_hist[i] = schedule_data[j].from_hist[i] = 0;
-        schedule_data[j].save_tsc = 0;
-    }
-}
-#else
+
 void print_sched_histo(unsigned char key) { }
 void reset_sched_histo(unsigned char key) { }
-#endif
+
 #endif
 
 /*
diff --git a/xen/common/trace.c b/xen/common/trace.c
index 48da9a7eb7..952a2f9583 100644
--- a/xen/common/trace.c
+++ b/xen/common/trace.c
@@ -66,7 +66,7 @@ void init_trace_bufs(void)
         return;
     }
 
-    nr_pages = smp_num_cpus * opt_tbuf_size;
+    nr_pages = num_online_cpus() * opt_tbuf_size;
     order    = get_order(nr_pages * PAGE_SIZE);
     
     if ( (rawbuf = (char *)alloc_xenheap_pages(order)) == NULL )
@@ -79,7 +79,7 @@ void init_trace_bufs(void)
     for ( i = 0; i < nr_pages; i++ )
         SHARE_PFN_WITH_DOMAIN(virt_to_page(rawbuf + i * PAGE_SIZE), dom0);
     
-    for ( i = 0; i < smp_num_cpus; i++ )
+    for_each_online_cpu ( i )
     {
         buf = t_bufs[i] = (struct t_buf *)&rawbuf[i*opt_tbuf_size*PAGE_SIZE];
         
diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h
index 0fb3e44727..17777ad123 100644
--- a/xen/include/asm-x86/asm_defns.h
+++ b/xen/include/asm-x86/asm_defns.h
@@ -6,8 +6,10 @@
 #include <asm/asm-offsets.h>
 #include <asm/processor.h>
 
+#ifndef STR
 #define __STR(x) #x
 #define STR(x) __STR(x)
+#endif
 
 #ifdef __x86_64__
 #include <asm/x86_64/asm_defns.h>
diff --git a/xen/include/asm-x86/bitops.h b/xen/include/asm-x86/bitops.h
index 2337197670..500d02f38f 100644
--- a/xen/include/asm-x86/bitops.h
+++ b/xen/include/asm-x86/bitops.h
@@ -7,6 +7,11 @@
 
 #include <xen/config.h>
 
+#ifndef STR
+#define __STR(x) #x
+#define STR(x) __STR(x)
+#endif
+
 /*
  * These have to be done with inline assembly: that way the bit-setting
  * is guaranteed to be atomic. All bit operations return 0 if the bit
@@ -246,29 +251,28 @@ static __inline__ int variable_test_bit(long nr, volatile void * addr)
 /**
  * find_first_zero_bit - find the first zero bit in a memory region
  * @addr: The address to start the search at
- * @size: The maximum bitnumber to search
+ * @size: The maximum size to search
  *
  * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit. -1 when none found.
+ * containing a bit.
  */
-static __inline__ int find_first_zero_bit(void * addr, unsigned size)
+static inline long find_first_zero_bit(
+    const unsigned long *addr, unsigned size)
 {
-	int d0, d1, d2;
-	int res;
+	long d0, d1, d2;
+	long res;
 
-	if (!size)
-		return 0;
 	__asm__ __volatile__(
-		"movl $-1,%%eax\n\t"
-		"xorl %%edx,%%edx\n\t"
-		"repe; scasl\n\t"
+		"mov $-1,%%"__OP"ax\n\t"
+		"xor %%edx,%%edx\n\t"
+		"repe; scas"__OS"\n\t"
 		"je 1f\n\t"
-		"xorl -4(%%"__OP"di),%%eax\n\t"
-		"sub"__OS" $4,%%"__OP"di\n\t"
-		"bsfl %%eax,%%edx\n"
-		"1:\tsub"__OS" %%"__OP"bx,%%"__OP"di\n\t"
-		"shl"__OS" $3,%%"__OP"di\n\t"
-		"add"__OS" %%"__OP"di,%%"__OP"dx"
+		"lea -"STR(BITS_PER_LONG/8)"(%%"__OP"di),%%"__OP"di\n\t"
+		"xor (%%"__OP"di),%%"__OP"ax\n\t"
+		"bsf %%"__OP"ax,%%"__OP"dx\n"
+		"1:\tsub %%"__OP"bx,%%"__OP"di\n\t"
+		"shl $3,%%"__OP"di\n\t"
+		"add %%"__OP"di,%%"__OP"dx"
 		:"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
 		:"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
 	return res;
@@ -280,66 +284,72 @@ static __inline__ int find_first_zero_bit(void * addr, unsigned size)
  * @offset: The bitnumber to start searching at
  * @size: The maximum size to search
  */
-static __inline__ int find_next_zero_bit (void * addr, int size, int offset)
-{
-	unsigned int * p = ((unsigned int *) addr) + (offset >> 5);
-	int set = 0, bit = offset & 31, res;
-	
-	if (bit) {
-		/*
-		 * Look for zero in first byte
-		 */
-		__asm__("bsfl %1,%0\n\t"
-			"jne 1f\n\t"
-			"movl $32, %0\n"
-			"1:"
-			: "=r" (set)
-			: "r" (~(*p >> bit)));
-		if (set < (32 - bit))
-			return set + offset;
-		set = 32 - bit;
-		p++;
-	}
-	/*
-	 * No zero yet, search remaining full bytes for a zero
-	 */
-	res = find_first_zero_bit (p, size - 32 * (p - (unsigned int *) addr));
-	return (offset + set + res);
-}
+long find_next_zero_bit(const unsigned long *addr, int size, int offset);
 
 /**
- * ffz - find first zero in word.
- * @word: The word to search
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
  *
- * Undefined if no zero exists, so code should check against ~0UL first.
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
  */
-static __inline__ unsigned long ffz(unsigned long word)
+static inline long find_first_bit(
+    const unsigned long *addr, unsigned size)
 {
-	__asm__("bsf"__OS" %1,%0"
-		:"=r" (word)
-		:"r" (~word));
-	return word;
+	long d0, d1;
+	long res;
+
+	__asm__ __volatile__(
+		"xor %%eax,%%eax\n\t"
+		"repe; scas"__OS"\n\t"
+		"je 1f\n\t"
+		"lea -"STR(BITS_PER_LONG/8)"(%%"__OP"di),%%"__OP"di\n\t"
+		"bsf (%%"__OP"di),%%"__OP"ax\n"
+		"1:\tsub %%"__OP"bx,%%"__OP"di\n\t"
+		"shl $3,%%"__OP"di\n\t"
+		"add %%"__OP"di,%%"__OP"ax"
+		:"=a" (res), "=&c" (d0), "=&D" (d1)
+		:"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory");
+	return res;
 }
 
 /**
- * ffs - find first bit set
- * @x: the word to search
- *
- * This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @offset: The bitnumber to start searching at
+ * @size: The maximum size to search
  */
-static __inline__ int ffs(int x)
-{
-	int r;
+long find_next_bit(const unsigned long *addr, int size, int offset);
 
-	__asm__("bsfl %1,%0\n\t"
-		"jnz 1f\n\t"
-		"movl $-1,%0\n"
-		"1:" : "=r" (r) : "g" (x));
-	return r+1;
+/* return index of first bit set in val or max when no bit is set */
+static inline unsigned long __scanbit(unsigned long val, unsigned long max)
+{
+	asm("bsf %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max));
+	return val;
 }
 
+#define find_first_bit(addr,size) \
+((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? \
+  (__scanbit(*(unsigned long *)addr,(size))) : \
+  find_first_bit(addr,size)))
+
+#define find_next_bit(addr,size,off) \
+((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ?         \
+  ((off) + (__scanbit((*(unsigned long *)addr) >> (off),(size)-(off)))) : \
+  find_next_bit(addr,size,off)))
+
+#define find_first_zero_bit(addr,size) \
+((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? \
+  (__scanbit(~*(unsigned long *)addr,(size))) : \
+  find_first_zero_bit(addr,size)))
+        
+#define find_next_zero_bit(addr,size,off) \
+((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ?         \
+  ((off)+(__scanbit(~(((*(unsigned long *)addr)) >> (off)),(size)-(off)))) : \
+  find_next_zero_bit(addr,size,off)))
+
+
 /*
  * These are the preferred 'find first' functions in Xen.
  * Both return the appropriate bit index, with the l.s.b. having index 0.
diff --git a/xen/include/asm-x86/div64.h b/xen/include/asm-x86/div64.h
index ef915df700..28ed8b296a 100644
--- a/xen/include/asm-x86/div64.h
+++ b/xen/include/asm-x86/div64.h
@@ -1,17 +1,48 @@
 #ifndef __I386_DIV64
 #define __I386_DIV64
 
+/*
+ * do_div() is NOT a C function. It wants to return
+ * two values (the quotient and the remainder), but
+ * since that doesn't work very well in C, what it
+ * does is:
+ *
+ * - modifies the 64-bit dividend _in_place_
+ * - returns the 32-bit remainder
+ *
+ * This ends up being the most efficient "calling
+ * convention" on x86.
+ */
 #define do_div(n,base) ({ \
-	unsigned long __upper, __low, __high, __mod; \
+	unsigned long __upper, __low, __high, __mod, __base; \
+	__base = (base); \
 	asm("":"=a" (__low), "=d" (__high):"A" (n)); \
 	__upper = __high; \
 	if (__high) { \
-		__upper = __high % (base); \
-		__high = __high / (base); \
+		__upper = __high % (__base); \
+		__high = __high / (__base); \
 	} \
-	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (base), "0" (__low), "1" (__upper)); \
+	asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \
 	asm("":"=A" (n):"a" (__low),"d" (__high)); \
 	__mod; \
 })
 
+/*
+ * (long)X = ((long long)divs) / (long)div
+ * (long)rem = ((long long)divs) % (long)div
+ *
+ * Warning: this will raise an exception if X overflows.
+ */
+#define div_long_long_rem(a,b,c) div_ll_X_l_rem(a,b,c)
+
+extern inline long
+div_ll_X_l_rem(long long divs, long div, long *rem)
+{
+	long dum2;
+      __asm__("divl %2":"=a"(dum2), "=d"(*rem)
+      :	"rm"(div), "A"(divs));
+
+	return dum2;
+
+}
 #endif
diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h
index 8f48465cb2..810bf345b7 100644
--- a/xen/include/asm-x86/flushtlb.h
+++ b/xen/include/asm-x86/flushtlb.h
@@ -93,7 +93,7 @@ extern void write_cr3(unsigned long cr3);
 #define local_flush_tlb_one(__addr) \
     __asm__ __volatile__("invlpg %0": :"m" (*(char *) (__addr)))
 
-#define flush_tlb_all()     flush_tlb_mask((1 << smp_num_cpus) - 1)
+#define flush_tlb_all()     flush_tlb_mask((1UL << num_online_cpus()) - 1) /* low n bits: assumes online CPUs are 0..n-1 */
 
 #ifndef CONFIG_SMP
 #define flush_tlb_all_pge()          local_flush_tlb_pge()
diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h
index 6036e849c5..97f143ad44 100644
--- a/xen/include/asm-x86/irq.h
+++ b/xen/include/asm-x86/irq.h
@@ -21,38 +21,31 @@ extern void (*interrupt[NR_IRQS])(void);
 
 #define platform_legacy_irq(irq)	((irq) < 16)
 
-extern void mask_irq(unsigned int irq);
-extern void unmask_irq(unsigned int irq);
-extern void disable_8259A_irq(unsigned int irq);
-extern void enable_8259A_irq(unsigned int irq);
-extern int i8259A_irq_pending(unsigned int irq);
-extern void make_8259A_irq(unsigned int irq);
-extern void init_8259A(int aeoi);
-extern void send_IPI_self(int vector);
-extern void init_VISWS_APIC_irqs(void);
-extern void setup_IO_APIC(void);
-extern void disable_IO_APIC(void);
-extern void print_IO_APIC(void);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
-extern void send_IPI(int dest, int vector);
+void disable_8259A_irq(unsigned int irq);
+void enable_8259A_irq(unsigned int irq);
+int i8259A_irq_pending(unsigned int irq);
+void make_8259A_irq(unsigned int irq);
+void init_8259A(int aeoi);
+void send_IPI_self(int vector);
+void init_VISWS_APIC_irqs(void);
+void setup_IO_APIC(void);
+void disable_IO_APIC(void);
+void print_IO_APIC(void);
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+void send_IPI(int dest, int vector);
+void setup_ioapic_dest(void);
 
 extern unsigned long io_apic_irqs;
 
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
-extern char _stext, _etext;
-
 #define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
 
-#include <xen/irq.h>
-
 static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i)
 {
-#if defined(CONFIG_X86_IO_APIC)
     if (IO_APIC_IRQ(i))
         send_IPI_self(IO_APIC_VECTOR(i));
-#endif
 }
 
 #endif /* _ASM_HW_IRQ_H */
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 907c820c12..94b82d4ba3 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -179,6 +179,7 @@ extern struct cpuinfo_x86 cpu_data[];
 #define current_cpu_data boot_cpu_data
 #endif
 
+extern  int phys_proc_id[NR_CPUS];
 extern char ignore_irq13;
 
 extern void identify_cpu(struct cpuinfo_x86 *);
diff --git a/xen/include/xen/bitmap.h b/xen/include/xen/bitmap.h
index c91a10aef0..3703384c3d 100644
--- a/xen/include/xen/bitmap.h
+++ b/xen/include/xen/bitmap.h
@@ -6,6 +6,7 @@
 #include <xen/config.h>
 #include <xen/lib.h>
 #include <xen/types.h>
+#include <xen/bitops.h>
 
 /*
  * bitmaps provide bit arrays that consume one or more unsigned
diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h
index 4881447842..f4ec7a2436 100644
--- a/xen/include/xen/cpumask.h
+++ b/xen/include/xen/cpumask.h
@@ -1,27 +1,378 @@
+#ifndef __XEN_CPUMASK_H
+#define __XEN_CPUMASK_H
+
 /*
- * XXX This to be replaced with the Linux file in the near future.
+ * Cpumasks provide a bitmap suitable for representing the
+ * set of CPUs in a system, one bit position per CPU number.
+ *
+ * See detailed comments in the file xen/bitmap.h describing the
+ * data type on which these cpumasks are based.
+ *
+ * For details of cpumask_scnprintf() and cpumask_parse(),
+ * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c.
+ *
+ * The available cpumask operations are:
+ *
+ * void cpu_set(cpu, mask)		turn on bit 'cpu' in mask
+ * void cpu_clear(cpu, mask)		turn off bit 'cpu' in mask
+ * void cpus_setall(mask)		set all bits
+ * void cpus_clear(mask)		clear all bits
+ * int cpu_isset(cpu, mask)		true iff bit 'cpu' set in mask
+ * int cpu_test_and_set(cpu, mask)	test and set bit 'cpu' in mask
+ *
+ * void cpus_and(dst, src1, src2)	dst = src1 & src2  [intersection]
+ * void cpus_or(dst, src1, src2)	dst = src1 | src2  [union]
+ * void cpus_xor(dst, src1, src2)	dst = src1 ^ src2
+ * void cpus_andnot(dst, src1, src2)	dst = src1 & ~src2
+ * void cpus_complement(dst, src)	dst = ~src
+ *
+ * int cpus_equal(mask1, mask2)		Does mask1 == mask2?
+ * int cpus_intersects(mask1, mask2)	Do mask1 and mask2 intersect?
+ * int cpus_subset(mask1, mask2)	Is mask1 a subset of mask2?
+ * int cpus_empty(mask)			Is mask empty (no bits set)?
+ * int cpus_full(mask)			Is mask full (all bits set)?
+ * int cpus_weight(mask)		Hamming weight - number of set bits
+ *
+ * void cpus_shift_right(dst, src, n)	Shift right
+ * void cpus_shift_left(dst, src, n)	Shift left
+ *
+ * int first_cpu(mask)			Number lowest set bit, or NR_CPUS
+ * int next_cpu(cpu, mask)		Next cpu past 'cpu', or NR_CPUS
+ *
+ * cpumask_t cpumask_of_cpu(cpu)	Return cpumask with bit 'cpu' set
+ * CPU_MASK_ALL				Initializer - all bits set
+ * CPU_MASK_NONE			Initializer - no bits set
+ * unsigned long *cpus_addr(mask)	Array of unsigned long's in mask
+ *
+ * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing (commented out below)
+ * int cpumask_parse(ubuf, ulen, mask)	Parse ascii string as cpumask (commented out below)
+ *
+ * for_each_cpu_mask(cpu, mask)		for-loop cpu over mask
+ *
+ * int num_online_cpus()		Number of online CPUs
+ * int num_possible_cpus()		Number of all possible CPUs
+ * int num_present_cpus()		Number of present CPUs
+ *
+ * int cpu_online(cpu)			Is some cpu online?
+ * int cpu_possible(cpu)		Is some cpu possible?
+ * int cpu_present(cpu)			Is some cpu present (can schedule)?
+ *
+ * int any_online_cpu(mask)		First online cpu in mask
+ *
+ * for_each_cpu(cpu)			for-loop cpu over cpu_possible_map
+ * for_each_online_cpu(cpu)		for-loop cpu over cpu_online_map
+ * for_each_present_cpu(cpu)		for-loop cpu over cpu_present_map
+ *
+ * Subtlety:
+ * 1) The 'type-checked' form of cpu_isset() causes gcc (3.3.2, anyway)
+ *    to generate slightly worse code.  Note for example the additional
+ *    40 lines of assembly code compiling the "for each possible cpu"
+ *    loops buried in the disk_stat_read() macros calls when compiling
+ *    drivers/block/genhd.c (arch i386, CONFIG_SMP=y).  So use a simple
+ *    one-line #define for cpu_isset(), instead of wrapping an inline
+ *    inside a macro, the way we do the other calls.
  */
 
-#ifndef __XEN_CPUMASK_H__
-#define __XEN_CPUMASK_H__
-
+#include <xen/config.h>
 #include <xen/bitmap.h>
+#include <xen/kernel.h>
 
-typedef u32 cpumask_t;
+typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
+extern cpumask_t _unused_cpumask_arg_;
+
+#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst))
+static inline void __cpu_set(int cpu, volatile cpumask_t *dstp)
+{
+	set_bit(cpu, dstp->bits);
+}
+
+#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst))
+static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp)
+{
+	clear_bit(cpu, dstp->bits);
+}
+
+#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS)
+static inline void __cpus_setall(cpumask_t *dstp, int nbits)
+{
+	bitmap_fill(dstp->bits, nbits);
+}
+
+#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS)
+static inline void __cpus_clear(cpumask_t *dstp, int nbits)
+{
+	bitmap_zero(dstp->bits, nbits);
+}
+
+/* No static inline type checking - see Subtlety (1) above. */
+#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits)
+
+#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask))
+static inline int __cpu_test_and_set(int cpu, cpumask_t *addr)
+{
+	return test_and_set_bit(cpu, addr->bits);
+}
+
+#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_and(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_andnot(dst, src1, src2) \
+				__cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS)
+static inline void __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS)
+static inline void __cpus_complement(cpumask_t *dstp,
+					const cpumask_t *srcp, int nbits)
+{
+	bitmap_complement(dstp->bits, srcp->bits, nbits);
+}
+
+#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_equal(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_equal(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_intersects(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_intersects(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS)
+static inline int __cpus_subset(const cpumask_t *src1p,
+					const cpumask_t *src2p, int nbits)
+{
+	return bitmap_subset(src1p->bits, src2p->bits, nbits);
+}
+
+#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS)
+static inline int __cpus_empty(const cpumask_t *srcp, int nbits)
+{
+	return bitmap_empty(srcp->bits, nbits);
+}
+
+#define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS)
+static inline int __cpus_full(const cpumask_t *srcp, int nbits)
+{
+	return bitmap_full(srcp->bits, nbits);
+}
+
+#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS)
+static inline int __cpus_weight(const cpumask_t *srcp, int nbits)
+{
+	return bitmap_weight(srcp->bits, nbits);
+}
+
+#define cpus_shift_right(dst, src, n) \
+			__cpus_shift_right(&(dst), &(src), (n), NR_CPUS)
+static inline void __cpus_shift_right(cpumask_t *dstp,
+					const cpumask_t *srcp, int n, int nbits)
+{
+	bitmap_shift_right(dstp->bits, srcp->bits, n, nbits);
+}
+
+#define cpus_shift_left(dst, src, n) \
+			__cpus_shift_left(&(dst), &(src), (n), NR_CPUS)
+static inline void __cpus_shift_left(cpumask_t *dstp,
+					const cpumask_t *srcp, int n, int nbits)
+{
+	bitmap_shift_left(dstp->bits, srcp->bits, n, nbits);
+}
+
+#define first_cpu(src) __first_cpu(&(src), NR_CPUS)
+static inline int __first_cpu(const cpumask_t *srcp, int nbits)
+{
+	return min_t(int, nbits, find_first_bit(srcp->bits, nbits));
+}
+
+#define next_cpu(n, src) __next_cpu((n), &(src), NR_CPUS)
+static inline int __next_cpu(int n, const cpumask_t *srcp, int nbits)
+{
+	return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1));
+}
+
+#define cpumask_of_cpu(cpu)						\
+({									\
+	typeof(_unused_cpumask_arg_) m;					\
+	if (sizeof(m) == sizeof(unsigned long)) {			\
+		m.bits[0] = 1UL<<(cpu);					\
+	} else {							\
+		cpus_clear(m);						\
+		cpu_set((cpu), m);					\
+	}								\
+	m;								\
+})
+
+#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)
+
+#if NR_CPUS <= BITS_PER_LONG
+
+#define CPU_MASK_ALL							\
+(cpumask_t) { {								\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
+} }
+
+#else
+
+#define CPU_MASK_ALL							\
+(cpumask_t) { {								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,			\
+	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
+} }
 
-#ifndef cpu_online_map
-extern cpumask_t cpu_online_map;
 #endif
 
-static inline int cpus_weight(cpumask_t w)
+#define CPU_MASK_NONE							\
+(cpumask_t) { {								\
+	[0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL				\
+} }
+
+#define CPU_MASK_CPU0							\
+(cpumask_t) { {								\
+	[0] =  1UL							\
+} }
+
+#define cpus_addr(src) ((src).bits)
+
+/*
+#define cpumask_scnprintf(buf, len, src) \
+			__cpumask_scnprintf((buf), (len), &(src), NR_CPUS)
+static inline int __cpumask_scnprintf(char *buf, int len,
+					const cpumask_t *srcp, int nbits)
 {
-    unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
-    res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
-    res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F);
-    res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF);
-    return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
+	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
 }
 
-#define cpus_addr(_m) (&(_m))
+#define cpumask_parse(ubuf, ulen, src) \
+			__cpumask_parse((ubuf), (ulen), &(src), NR_CPUS)
+static inline int __cpumask_parse(const char __user *buf, int len,
+					cpumask_t *dstp, int nbits)
+{
+	return bitmap_parse(buf, len, dstp->bits, nbits);
+}
+*/
+
+#if NR_CPUS > 1
+#define for_each_cpu_mask(cpu, mask)		\
+	for ((cpu) = first_cpu(mask);		\
+		(cpu) < NR_CPUS;		\
+		(cpu) = next_cpu((cpu), (mask)))
+#else /* NR_CPUS == 1 */
+#define for_each_cpu_mask(cpu, mask) for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#endif /* NR_CPUS */
+
+/*
+ * The following particular system cpumasks and operations manage
+ * possible, present and online cpus.  Each of them is a fixed size
+ * bitmap of size NR_CPUS.
+ *
+ *  #ifdef CONFIG_HOTPLUG_CPU
+ *     cpu_possible_map - all NR_CPUS bits set
+ *     cpu_present_map  - has bit 'cpu' set iff cpu is populated
+ *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
+ *  #else
+ *     cpu_possible_map - has bit 'cpu' set iff cpu is populated
+ *     cpu_present_map  - copy of cpu_possible_map
+ *     cpu_online_map   - has bit 'cpu' set iff cpu available to scheduler
+ *  #endif
+ *
+ *  In either case, NR_CPUS is fixed at compile time, as the static
+ *  size of these bitmaps.  The cpu_possible_map is fixed at boot
+ *  time, as the set of CPU id's that it is possible might ever
+ *  be plugged in at anytime during the life of that system boot.
+ *  The cpu_present_map is dynamic(*), representing which CPUs
+ *  are currently plugged in.  And cpu_online_map is the dynamic
+ *  subset of cpu_present_map, indicating those CPUs available
+ *  for scheduling.
+ *
+ *  If HOTPLUG is enabled, then cpu_possible_map is forced to have
+ *  all NR_CPUS bits set, otherwise it is just the set of CPUs that
+ *  ACPI reports present at boot.
+ *
+ *  If HOTPLUG is enabled, then cpu_present_map varies dynamically,
+ *  depending on what ACPI reports as currently plugged in, otherwise
+ *  cpu_present_map is just a copy of cpu_possible_map.
+ *
+ *  (*) Well, cpu_present_map is dynamic in the hotplug case.  If not
+ *      hotplug, it's a copy of cpu_possible_map, hence fixed at boot.
+ *
+ * Subtleties:
+ * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
+ *    assumption that their single CPU is online.  The UP
+ *    cpu_{online,possible,present}_maps are placebos.  Changing them
+ *    will have no useful effect on the following num_*_cpus()
+ *    and cpu_*() macros in the UP case.  This ugliness is a UP
+ *    optimization - don't waste any instructions or memory references
+ *    asking if you're online or how many CPUs there are if there is
+ *    only one CPU.
+ * 2) Most SMP arch's #define some of these maps to be some
+ *    other map specific to that arch.  Therefore, the following
+ *    must be #define macros, not inlines.  To see why, examine
+ *    the assembly code produced by the following.  Note that
+ *    set1() writes phys_x_map, but set2() writes x_map:
+ *        int x_map, phys_x_map;
+ *        #define set1(a) x_map = a
+ *        inline void set2(int a) { x_map = a; }
+ *        #define x_map phys_x_map
+ *        main(){ set1(3); set2(5); }
+ */
+
+extern cpumask_t cpu_possible_map;
+extern cpumask_t cpu_online_map;
+extern cpumask_t cpu_present_map;
+
+#if NR_CPUS > 1
+#define num_online_cpus()	cpus_weight(cpu_online_map)
+#define num_possible_cpus()	cpus_weight(cpu_possible_map)
+#define num_present_cpus()	cpus_weight(cpu_present_map)
+#define cpu_online(cpu)		cpu_isset((cpu), cpu_online_map)
+#define cpu_possible(cpu)	cpu_isset((cpu), cpu_possible_map)
+#define cpu_present(cpu)	cpu_isset((cpu), cpu_present_map)
+#else
+#define num_online_cpus()	1
+#define num_possible_cpus()	1
+#define num_present_cpus()	1
+#define cpu_online(cpu)		((cpu) == 0)
+#define cpu_possible(cpu)	((cpu) == 0)
+#define cpu_present(cpu)	((cpu) == 0)
+#endif
+
+#define any_online_cpu(mask)			\
+({						\
+	int __cpu; /* not 'cpu': must not capture a 'cpu' used inside 'mask' */ \
+	for_each_cpu_mask(__cpu, (mask))	\
+		if (cpu_online(__cpu))		\
+			break;			\
+	__cpu;					\
+})
+
+#define for_each_cpu(cpu)	  for_each_cpu_mask((cpu), cpu_possible_map)
+#define for_each_online_cpu(cpu)  for_each_cpu_mask((cpu), cpu_online_map)
+#define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
 
-#endif /* __XEN_CPUMASK_H__ */
+#endif /* __XEN_CPUMASK_H */
diff --git a/xen/include/xen/kernel.h b/xen/include/xen/kernel.h
index 993a6c19cf..53a7251838 100644
--- a/xen/include/xen/kernel.h
+++ b/xen/include/xen/kernel.h
@@ -33,5 +33,29 @@
 #define max_t(type,x,y) \
         ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; })
 
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ *
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Check at compile time that something is of a particular type.
+ * Always evaluates to 1 so you may use it easily in comparisons.
+ */
+#define typecheck(type,x) \
+({	type __dummy; \
+	typeof(x) __dummy2; \
+	(void)(&__dummy == &__dummy2); \
+	1; \
+})
+
+
 #endif /* _LINUX_KERNEL_H */
 
diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h
index 7cd5295fea..47fb6452f0 100644
--- a/xen/include/xen/sched-if.h
+++ b/xen/include/xen/sched-if.h
@@ -8,7 +8,6 @@
 #ifndef __XEN_SCHED_IF_H__
 #define __XEN_SCHED_IF_H__
 
-//#define ADV_SCHED_HISTO
 #define BUCKETS  10
 /*300*/
 
@@ -19,11 +18,6 @@ struct schedule_data {
     void               *sched_priv;
     struct ac_timer     s_timer;        /* scheduling timer                */
     unsigned long       tick;           /* current periodic 'tick'         */
-#ifdef ADV_SCHED_HISTO
-    u32			to_hist[BUCKETS];
-    u32			from_hist[BUCKETS];
-    u64			save_tsc;
-#endif
 #ifdef BUCKETS
     u32                 hist[BUCKETS];  /* for scheduler latency histogram */
 #endif
@@ -39,8 +33,6 @@ struct scheduler {
     char *opt_name;         /* option name for this scheduler    */
     unsigned int sched_id;  /* ID for this scheduler             */
 
-    int          (*init_scheduler) (void);
-    int          (*init_idle_task) (struct exec_domain *);
     int          (*alloc_task)     (struct exec_domain *);
     void         (*add_task)       (struct exec_domain *);
     void         (*free_task)      (struct domain *);
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 21e4a95c38..0bfc2345b4 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -246,9 +246,6 @@ void new_thread(struct exec_domain *d,
                 unsigned long start_stack,
                 unsigned long start_info);
 
-extern unsigned long wait_init_idle;
-#define init_idle() clear_bit(smp_processor_id(), &wait_init_idle);
-
 #define set_current_state(_s) do { current->state = (_s); } while (0)
 void scheduler_init(void);
 void schedulers_start(void);
@@ -257,7 +254,6 @@ void sched_rem_domain(struct exec_domain *);
 long sched_ctl(struct sched_ctl_cmd *);
 long sched_adjdom(struct sched_adjdom_cmd *);
 int  sched_id();
-void init_idle_task(void);
 void domain_wake(struct exec_domain *d);
 void domain_sleep(struct exec_domain *d);
 
diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h
index 811e25ac24..2278308525 100644
--- a/xen/include/xen/smp.h
+++ b/xen/include/xen/smp.h
@@ -26,19 +26,19 @@ extern void smp_send_event_check_mask(unsigned long cpu_mask);
 #define smp_send_event_check_cpu(_cpu) smp_send_event_check_mask(1<<(_cpu))
 
 /*
- * Boot processor call to load the other CPU's
+ * Prepare machine for booting other CPUs.
  */
-extern void smp_boot_cpus(void);
+extern void smp_prepare_cpus(unsigned int max_cpus);
 
 /*
- * Processor call in. Must hold processors until ..
+ * Bring a CPU up
  */
-extern void smp_callin(void);
+extern int __cpu_up(unsigned int cpunum);
 
 /*
- * Multiprocessors may now schedule
+ * Final polishing of CPUs
  */
-extern void smp_commence(void);
+extern void smp_cpus_done(unsigned int max_cpus);
 
 /*
  * Call a function on all other processors
@@ -57,12 +57,6 @@ static inline int on_each_cpu(void (*func) (void *info), void *info,
     return ret;
 }
 
-/*
- * True once the per process idle is forked
- */
-extern int smp_threads_ready;
-
-extern int smp_num_cpus;
 extern int ht_per_core;
 extern int opt_noht;
 
@@ -80,6 +74,12 @@ extern volatile int smp_msg_id;
 #define MSG_RESCHEDULE		0x0003	/* Reschedule request from master CPU*/
 #define MSG_CALL_FUNCTION       0x0004  /* Call function on all other CPUs */
 
+/*
+ * Mark the boot cpu "online" so that it can call console drivers in
+ * printk() and can access its per-cpu storage.
+ */
+void smp_prepare_boot_cpu(void);
+
 #else
 
 /*
@@ -88,16 +88,14 @@ extern volatile int smp_msg_id;
 
 #define smp_send_event_check_mask(_m)           ((void)0)
 #define smp_send_event_check_cpu(_p)            ((void)0) 
-#define smp_num_cpus				1
+#ifndef __smp_processor_id
 #define smp_processor_id()			0
+#endif
 #define hard_smp_processor_id()			0
-#define smp_threads_ready			1
-#define kernel_lock()
-#define cpu_logical_map(cpu)			0
-#define cpu_number_map(cpu)			0
 #define smp_call_function(func,info,retry,wait)	0
 #define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
-#define cpu_online_map				1
+#define num_booting_cpus()			1
+#define smp_prepare_boot_cpu()			do {} while (0)
 
 #endif
 
-- 
2.30.2